Example 1 with RowToCsv

Use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.

From the class DataTokenizationTest, method testRowToCSVWithNull.

@Test
public void testRowToCSVWithNull() {
    final String nullableTestSchema = "{\"fields\":[{\"mode\":\"REQUIRED\",\"name\":\"FieldName1\",\"type\":\"STRING\"},{\"mode\":\"NULLABLE\",\"name\":\"FieldName2\",\"type\":\"STRING\"}]}";
    final String expectedCsv = "TestValueOne;null";
    List<Object> values = Lists.newArrayList("TestValueOne", null);
    Schema beamSchema = new SchemasUtils(nullableTestSchema).getBeamSchema();
    Row.Builder rowBuilder = Row.withSchema(beamSchema);
    Row row = rowBuilder.addValues(values).build();
    String csvResult = new RowToCsv(";").getCsvFromRow(row);
    Assert.assertEquals(expectedCsv, csvResult);
}
Also used: SchemasUtils (org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils), RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv), Schema (org.apache.beam.sdk.schemas.Schema), Row (org.apache.beam.sdk.values.Row), Test (org.junit.Test)
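
For context on what the assertion above expects: RowToCsv joins the row's values with the configured delimiter, rendering a null value as the literal string "null". A minimal stand-alone sketch of that behavior (a hypothetical stand-in, not the actual Beam utility):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/** Hypothetical stand-in for RowToCsv, for illustration only. */
class SimpleRowToCsv {

    private final String delimiter;

    SimpleRowToCsv(String delimiter) {
        this.delimiter = delimiter;
    }

    /** Joins values with the delimiter; String.valueOf renders null as "null". */
    String csvFrom(List<Object> values) {
        return values.stream()
                .map(String::valueOf)
                .collect(Collectors.joining(delimiter));
    }

    public static void main(String[] args) {
        List<Object> values = Arrays.asList("TestValueOne", null);
        // Prints "TestValueOne;null", matching the expected CSV in the test above.
        System.out.println(new SimpleRowToCsv(";").csvFrom(values));
    }
}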

Example 2 with RowToCsv

Use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.

From the class DataTokenizationTest, method testRowToCSV.

@Test
public void testRowToCSV() {
    Schema beamSchema = new SchemasUtils(testSchema).getBeamSchema();
    Row.Builder rowBuilder = Row.withSchema(beamSchema);
    Row row = rowBuilder.addValues(new ArrayList<>(Arrays.asList(fields))).build();
    String csvResult = new RowToCsv(";").getCsvFromRow(row);
    Assert.assertEquals(String.join(";", fields), csvResult);
}
Also used: SchemasUtils (org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils), RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv), Schema (org.apache.beam.sdk.schemas.Schema), ArrayList (java.util.ArrayList), Row (org.apache.beam.sdk.values.Row), Test (org.junit.Test)
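
This test relies on the testSchema and fields fixtures defined elsewhere in DataTokenizationTest. A plausible shape for them, with hypothetical values mirroring the schema string from Example 1 (not the actual test data):

// Hypothetical fixture values, for illustration only.
private static final String testSchema =
    "{\"fields\":[{\"mode\":\"REQUIRED\",\"name\":\"FieldName1\",\"type\":\"STRING\"},"
        + "{\"mode\":\"REQUIRED\",\"name\":\"FieldName2\",\"type\":\"STRING\"}]}";

private static final String[] fields = {"TestValueOne", "TestValueTwo"};

With fixtures like these, String.join(";", fields) produces "TestValueOne;TestValueTwo", which the assertion compares against the RowToCsv output.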

Example 3 with RowToCsv

Use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.

From the class DataTokenization, method run.

/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
@SuppressWarnings({ "dereference.of.nullable", "argument.type.incompatible" })
public static PipelineResult run(DataTokenizationOptions options) {
    SchemasUtils schema = null;
    try {
        schema = new SchemasUtils(options.getDataSchemaPath(), StandardCharsets.UTF_8);
    } catch (IOException e) {
        LOG.error("Failed to retrieve schema for data.", e);
    }
    checkArgument(schema != null, "Data schema is mandatory.");
    // Create the pipeline
    Pipeline pipeline = Pipeline.create(options);
    // Register the coder for pipeline
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);
    coderRegistry.registerCoderForType(RowCoder.of(schema.getBeamSchema()).getEncodedTypeDescriptor(), RowCoder.of(schema.getBeamSchema()));
    /*
     * Row/Row Coder for FailsafeElement.
     */
    FailsafeElementCoder<Row, Row> coder = FailsafeElementCoder.of(RowCoder.of(schema.getBeamSchema()), RowCoder.of(schema.getBeamSchema()));
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);
    PCollection<Row> rows;
    if (options.getInputFilePattern() != null) {
        rows = new TokenizationFileSystemIO(options).read(pipeline, schema);
    } else if (options.getPubsubTopic() != null) {
        rows =
            pipeline
                .apply("ReadMessagesFromPubsub", PubsubIO.readStrings().fromTopic(options.getPubsubTopic()))
                .apply("TransformToBeamRow", new JsonToBeamRow(options.getNonTokenizedDeadLetterPath(), schema));
        if (options.getOutputDirectory() != null) {
            rows = rows.apply(Window.into(FixedWindows.of(parseDuration(options.getWindowDuration()))));
        }
    } else {
        throw new IllegalStateException("No source is provided, please configure File System or Pub/Sub");
    }
    /*
     * Tokenize data using a remote API call.
     */
    PCollectionTuple tokenizedRows =
        rows.setRowSchema(schema.getBeamSchema())
            .apply(
                MapElements.into(TypeDescriptors.kvs(TypeDescriptors.integers(), TypeDescriptors.rows()))
                    .via((Row row) -> KV.of(0, row)))
            .setCoder(KvCoder.of(VarIntCoder.of(), RowCoder.of(schema.getBeamSchema())))
            .apply(
                "DsgTokenization",
                RowToTokenizedRow.newBuilder()
                    .setBatchSize(options.getBatchSize())
                    .setRpcURI(options.getRpcUri())
                    .setSchema(schema.getBeamSchema())
                    .setSuccessTag(TOKENIZATION_OUT)
                    .setFailureTag(TOKENIZATION_DEADLETTER_OUT)
                    .build());
    String csvDelimiter = options.getCsvDelimiter();
    if (options.getNonTokenizedDeadLetterPath() != null) {
        /*
         * Write tokenization errors to the dead-letter sink.
         */
        tokenizedRows
            .get(TOKENIZATION_DEADLETTER_OUT)
            .apply(
                "ConvertToCSV",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(
                        (FailsafeElement<Row, Row> fse) ->
                            FailsafeElement.of(
                                new RowToCsv(csvDelimiter).getCsvFromRow(fse.getOriginalPayload()),
                                new RowToCsv(csvDelimiter).getCsvFromRow(fse.getPayload()))))
            .apply(
                "WriteTokenizationErrorsToFS",
                ErrorConverters.WriteErrorsToTextIO.<String, String>newBuilder()
                    .setErrorWritePath(options.getNonTokenizedDeadLetterPath())
                    .setTranslateFunction(SerializableFunctions.getCsvErrorConverter())
                    .build());
    }
    if (options.getOutputDirectory() != null) {
        new TokenizationFileSystemIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
    } else if (options.getBigQueryTableName() != null) {
        WriteResult writeResult =
            TokenizationBigQueryIO.write(
                tokenizedRows.get(TOKENIZATION_OUT),
                options.getBigQueryTableName(),
                schema.getBigQuerySchema());
        writeResult
            .getFailedInsertsWithErr()
            .apply(
                "WrapInsertionErrors",
                MapElements.into(FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor())
                    .via(TokenizationBigQueryIO::wrapBigQueryInsertError))
            .setCoder(FAILSAFE_ELEMENT_CODER)
            .apply(
                "WriteInsertionFailedRecords",
                ErrorConverters.WriteStringMessageErrors.newBuilder()
                    .setErrorRecordsTable(options.getBigQueryTableName() + DEFAULT_DEADLETTER_TABLE_SUFFIX)
                    .setErrorRecordsTableSchema(DEADLETTER_SCHEMA)
                    .build());
    } else if (options.getBigTableInstanceId() != null) {
        new TokenizationBigTableIO(options).write(tokenizedRows.get(TOKENIZATION_OUT), schema.getBeamSchema());
    } else {
        throw new IllegalStateException("No sink is provided, please configure BigQuery or BigTable.");
    }
    return pipeline.run();
}
Also used: TokenizationBigTableIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigTableIO), SchemasUtils (org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils), TokenizationFileSystemIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO), TokenizationBigQueryIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigQueryIO), RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv), IOException (java.io.IOException), Pipeline (org.apache.beam.sdk.Pipeline), FailsafeElement (org.apache.beam.examples.complete.datatokenization.utils.FailsafeElement), CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry), JsonToBeamRow (org.apache.beam.examples.complete.datatokenization.transforms.JsonToBeamRow), WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), Row (org.apache.beam.sdk.values.Row), RowToTokenizedRow (org.apache.beam.examples.complete.datatokenization.transforms.DataProtectors.RowToTokenizedRow)
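
For completeness, run(...) is typically invoked from a main method that parses command-line arguments into the custom options interface via the standard Beam pattern. A minimal sketch of such an entry point (the actual main in DataTokenization may differ; assumes org.apache.beam.sdk.options.PipelineOptionsFactory is imported):

public static void main(String[] args) {
    // Parse and validate the pipeline arguments into DataTokenizationOptions.
    DataTokenizationOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(DataTokenizationOptions.class);
    run(options);
}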

Example 4 with RowToCsv

Use of org.apache.beam.examples.complete.datatokenization.utils.RowToCsv in project beam by apache.

From the class TokenizationFileSystemIO, method writeCsv.

/** Writes the rows as delimited text, choosing windowed writes for unbounded input. */
private PDone writeCsv(PCollection<Row> input, Schema schema) {
    String header = String.join(options.getCsvDelimiter(), schema.getFieldNames());
    String csvDelimiter = options.getCsvDelimiter();
    PCollection<String> csvs =
        input.apply(
            "ConvertToCSV",
            MapElements.into(TypeDescriptors.strings())
                .via((Row inputRow) -> new RowToCsv(csvDelimiter).getCsvFromRow(inputRow)));
    if (csvs.isBounded() == IsBounded.BOUNDED) {
        return csvs.apply("WriteToFS", TextIO.write().to(options.getOutputDirectory()).withHeader(header));
    } else {
        return csvs.apply(
            "WriteToFS",
            TextIO.write()
                .withWindowedWrites()
                .withNumShards(1)
                .to(options.getOutputDirectory())
                .withHeader(header));
    }
}
Also used: RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv), JsonToBeamRow (org.apache.beam.examples.complete.datatokenization.transforms.JsonToBeamRow), Row (org.apache.beam.sdk.values.Row)
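
Note the two branches: TextIO can let the runner choose sharding for bounded input, but an unbounded (streaming) PCollection must use withWindowedWrites() with an explicit shard count, and the input must already be windowed. DataTokenization.run (Example 3) applies that windowing before this write; a minimal sketch of the idea (the variable names and duration are illustrative; assumes the windowing imports listed in the comment):

// Assumes: org.apache.beam.sdk.transforms.windowing.{Window, FixedWindows}
// and org.joda.time.Duration are imported, and rows is an unbounded
// PCollection<Row>. It must be windowed before the streaming TextIO
// branch above can write it to files.
PCollection<Row> windowed =
    rows.apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))));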

Aggregations

RowToCsv (org.apache.beam.examples.complete.datatokenization.utils.RowToCsv): 4 uses
Row (org.apache.beam.sdk.values.Row): 4 uses
SchemasUtils (org.apache.beam.examples.complete.datatokenization.utils.SchemasUtils): 3 uses
JsonToBeamRow (org.apache.beam.examples.complete.datatokenization.transforms.JsonToBeamRow): 2 uses
Schema (org.apache.beam.sdk.schemas.Schema): 2 uses
Test (org.junit.Test): 2 uses
IOException (java.io.IOException): 1 use
ArrayList (java.util.ArrayList): 1 use
RowToTokenizedRow (org.apache.beam.examples.complete.datatokenization.transforms.DataProtectors.RowToTokenizedRow): 1 use
TokenizationBigQueryIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigQueryIO): 1 use
TokenizationBigTableIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationBigTableIO): 1 use
TokenizationFileSystemIO (org.apache.beam.examples.complete.datatokenization.transforms.io.TokenizationFileSystemIO): 1 use
FailsafeElement (org.apache.beam.examples.complete.datatokenization.utils.FailsafeElement): 1 use
Pipeline (org.apache.beam.sdk.Pipeline): 1 use
CoderRegistry (org.apache.beam.sdk.coders.CoderRegistry): 1 use
WriteResult (org.apache.beam.sdk.io.gcp.bigquery.WriteResult): 1 use
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 1 use