
Example 1 with Result

Use of org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result in project beam by apache.

From the class BatchLoads, method expand:

@Override
public WriteResult expand(PCollection<KV<DestinationT, TableRow>> input) {
    Pipeline p = input.getPipeline();
    final String stepUuid = BigQueryHelpers.randomUUIDString();
    // Create a singleton temp-file prefix, resolved at execution time from the
    // pipeline's temp location.
    PCollectionView<String> tempFilePrefix =
        p.apply("Create", Create.of((Void) null))
            .apply("GetTempFilePrefix", ParDo.of(new DoFn<Void, String>() {
                @ProcessElement
                public void getTempFilePrefix(ProcessContext c) {
                    c.output(resolveTempLocation(
                        c.getPipelineOptions().getTempLocation(), "BigQueryWriteTemp", stepUuid));
                }
            }))
            .apply("TempFilePrefixView", View.<String>asSingleton());
    // Create a singleton job ID token at execution time. This will be used as the base for all
    // load jobs issued from this instance of the transform.
    PCollectionView<String> jobIdTokenView =
        p.apply("TriggerIdCreation", Create.of("ignored"))
            .apply("CreateJobId", MapElements.via(new SimpleFunction<String, String>() {
                @Override
                public String apply(String input) {
                    return stepUuid;
                }
            }))
            .apply(View.<String>asSingleton());
    PCollection<KV<DestinationT, TableRow>> inputInGlobalWindow =
        input.apply(
            "rewindowIntoGlobal",
            Window.<KV<DestinationT, TableRow>>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes());
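    // Compute the schema for each destination; the resulting map is used below
    // as a side input to WriteTables.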
    PCollectionView<Map<DestinationT, String>> schemasView = inputInGlobalWindow.apply(new CalculateSchemas<>(dynamicDestinations));
    TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
        new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
    TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittenRecordsTag =
        new TupleTag<KV<ShardedKey<DestinationT>, TableRow>>("unwrittenRecords") {};
    PCollectionTuple writeBundlesTuple =
        inputInGlobalWindow.apply(
            "WriteBundlesToFiles",
            ParDo.of(new WriteBundlesToFiles<>(
                    stepUuid, unwrittenRecordsTag, maxNumWritersPerBundle, maxFileSize))
                .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittenRecordsTag)));
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
        writeBundlesTuple.get(writtenFilesTag)
            .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
    // If the bundles contain too many output tables to be written inline to files (due to memory
    // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
    // Group these records by key, and write the files after grouping. Since the records are
    // grouped by key, only one file needs to be open at a time in each bundle.
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
        writeBundlesTuple.get(unwrittenRecordsTag)
            .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), TableRowJsonCoder.of()))
            .apply(GroupByKey.<ShardedKey<DestinationT>, TableRow>create())
            .apply(ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
                .withSideInputs(tempFilePrefix))
            .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
    // PCollection of filename, file byte size, and table destination.
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results =
        PCollectionList.of(writtenFiles)
            .and(writtenFilesGrouped)
            .apply(Flatten.<Result<DestinationT>>pCollections());
    TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag =
        new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("multiPartitionsTag") {};
    TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag =
        new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("singlePartitionTag") {};
    // Turn the list of file results into a PCollectionView that can be used as a
    // side input.
    PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> resultsView =
        results.apply("ResultsView", View.<WriteBundlesToFiles.Result<DestinationT>>asIterable());
    // This transform will look at the set of files written for each table, and if any table has
    // too many files or bytes, will partition that table's files into multiple partitions for
    // loading.
    PCollection<Void> singleton =
        p.apply("singleton", Create.of((Void) null).withCoder(VoidCoder.of()));
    PCollectionTuple partitions =
        singleton.apply(
            "WritePartition",
            ParDo.of(new WritePartition<>(
                    singletonTable, tempFilePrefix, resultsView, multiPartitionsTag, singlePartitionTag))
                .withSideInputs(tempFilePrefix, resultsView)
                .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
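    // Side inputs consumed by the WriteTables steps below: the job ID token, the
    // per-destination schema map, and any side inputs declared by dynamicDestinations.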
    List<PCollectionView<?>> writeTablesSideInputs = Lists.newArrayList(jobIdTokenView, schemasView);
    writeTablesSideInputs.addAll(dynamicDestinations.getSideInputs());
    Coder<KV<ShardedKey<DestinationT>, List<String>>> partitionsCoder =
        KvCoder.of(
            ShardedKeyCoder.of(NullableCoder.of(destinationCoder)),
            ListCoder.of(StringUtf8Coder.of()));
    // If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
    // the import needs to be split into multiple partitions, and those partitions will be
    // specified in multiPartitionsTag.
    PCollection<KV<TableDestination, String>> tempTables =
        partitions.get(multiPartitionsTag)
            .setCoder(partitionsCoder)
            .apply("MultiPartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
            .apply(
                "MultiPartitionsWriteTables",
                ParDo.of(new WriteTables<>(
                        false,
                        bigQueryServices,
                        jobIdTokenView,
                        schemasView,
                        WriteDisposition.WRITE_EMPTY,
                        CreateDisposition.CREATE_IF_NEEDED,
                        dynamicDestinations))
                    .withSideInputs(writeTablesSideInputs));
    // This view maps each final table destination to the set of temporary partitioned tables
    // the PCollection was loaded into.
    PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
        tempTables.apply("TempTablesView", View.<TableDestination, String>asMultimap());
    singleton.apply(
        "WriteRename",
        ParDo.of(new WriteRename(
                bigQueryServices, jobIdTokenView, writeDisposition, createDisposition, tempTablesView))
            .withSideInputs(tempTablesView, jobIdTokenView));
    // Write single partition to final table.
    partitions.get(singlePartitionTag)
        .setCoder(partitionsCoder)
        .apply("SinglePartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
        .apply(
            "SinglePartitionWriteTables",
            ParDo.of(new WriteTables<>(
                    true,
                    bigQueryServices,
                    jobIdTokenView,
                    schemasView,
                    writeDisposition,
                    createDisposition,
                    dynamicDestinations))
                .withSideInputs(writeTablesSideInputs));
    PCollection<TableRow> empty = p.apply("CreateEmptyFailedInserts", Create.empty(TypeDescriptor.of(TableRow.class)));
    return WriteResult.in(input.getPipeline(), new TupleTag<TableRow>("failedInserts"), empty);
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) TupleTagList(org.apache.beam.sdk.values.TupleTagList) PCollectionList(org.apache.beam.sdk.values.PCollectionList) List(java.util.List) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) PCollectionView(org.apache.beam.sdk.values.PCollectionView) TableRow(com.google.api.services.bigquery.model.TableRow) Map(java.util.Map)
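
The expand method above leans on two Beam idioms: a ParDo that declares multiple tagged outputs via withOutputTags (writtenFilesTag plus the spill tag), and singleton side inputs built with View.asSingleton() (tempFilePrefix, jobIdTokenView). The following is a minimal, self-contained sketch of those two idioms in isolation; it is not Beam's own code, and names such as MultiOutputSketch, thresholdView, smallTag, and largeTag are invented for illustration. It assumes a runner such as the DirectRunner is on the classpath.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputSketch {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        // A singleton side input, analogous to tempFilePrefix/jobIdTokenView above.
        final PCollectionView<Integer> thresholdView =
            p.apply("Threshold", Create.of(10)).apply(View.<Integer>asSingleton());

        // Two tags name the main and secondary outputs, like writtenFilesTag and
        // unwrittenRecordsTag in BatchLoads.
        final TupleTag<Integer> smallTag = new TupleTag<Integer>("small") {};
        final TupleTag<Integer> largeTag = new TupleTag<Integer>("large") {};

        PCollectionTuple split =
            p.apply("Nums", Create.of(Arrays.asList(3, 7, 42)))
                .apply(ParDo.of(new DoFn<Integer, Integer>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                            // Route each element by comparing it against the side input.
                            if (c.element() < c.sideInput(thresholdView)) {
                                c.output(c.element()); // main output (smallTag)
                            } else {
                                c.output(largeTag, c.element()); // secondary output
                            }
                        }
                    })
                    .withSideInputs(thresholdView)
                    .withOutputTags(smallTag, TupleTagList.of(largeTag)));

        // Retrieve each tagged output from the resulting PCollectionTuple.
        PCollection<Integer> small = split.get(smallTag);
        PCollection<Integer> large = split.get(largeTag);
        p.run().waitUntilFinish();
    }
}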

Example 2 with Result

Use of org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result in project beam by apache.

From the class BigQueryIOTest, method testWritePartition:

private void testWritePartition(long numTables, long numFilesPerTable, long fileSize, long expectedNumPartitionsPerTable) throws Exception {
    p.enableAbandonedNodeEnforcement(false);
    // In the case where a static destination is specified (i.e. not through a dynamic table
    // function) and there is no input data, WritePartition will generate an empty table. This
    // code is to test that path.
    boolean isSingleton = numTables == 1 && numFilesPerTable == 0;
    List<ShardedKey<String>> expectedPartitions = Lists.newArrayList();
    if (isSingleton) {
        expectedPartitions.add(ShardedKey.<String>of(null, 1));
    } else {
        for (int i = 0; i < numTables; ++i) {
            for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
                String tableName = String.format("project-id:dataset-id.tables%05d", i);
                expectedPartitions.add(ShardedKey.of(tableName, j));
            }
        }
    }
    List<WriteBundlesToFiles.Result<String>> files = Lists.newArrayList();
    Map<String, List<String>> filenamesPerTable = Maps.newHashMap();
    for (int i = 0; i < numTables; ++i) {
        String tableName = String.format("project-id:dataset-id.tables%05d", i);
        List<String> filenames = filenamesPerTable.get(tableName);
        if (filenames == null) {
            filenames = Lists.newArrayList();
            filenamesPerTable.put(tableName, filenames);
        }
        for (int j = 0; j < numFilesPerTable; ++j) {
            String fileName = String.format("%s_files%05d", tableName, j);
            filenames.add(fileName);
            files.add(new Result<>(fileName, fileSize, tableName));
        }
    }
    TupleTag<KV<ShardedKey<String>, List<String>>> multiPartitionsTag =
        new TupleTag<KV<ShardedKey<String>, List<String>>>("multiPartitionsTag") {};
    TupleTag<KV<ShardedKey<String>, List<String>>> singlePartitionTag =
        new TupleTag<KV<ShardedKey<String>, List<String>>>("singlePartitionTag") {};
    PCollectionView<Iterable<WriteBundlesToFiles.Result<String>>> resultsView =
        p.apply(Create.of(files).withCoder(WriteBundlesToFiles.ResultCoder.of(StringUtf8Coder.of())))
            .apply(View.<WriteBundlesToFiles.Result<String>>asIterable());
    String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
    PCollectionView<String> tempFilePrefixView = p.apply(Create.of(tempFilePrefix)).apply(View.<String>asSingleton());
    WritePartition<String> writePartition = new WritePartition<>(isSingleton, tempFilePrefixView, resultsView, multiPartitionsTag, singlePartitionTag);
    DoFnTester<Void, KV<ShardedKey<String>, List<String>>> tester = DoFnTester.of(writePartition);
    tester.setSideInput(resultsView, GlobalWindow.INSTANCE, files);
    tester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
    tester.processElement(null);
    List<KV<ShardedKey<String>, List<String>>> partitions;
    if (expectedNumPartitionsPerTable > 1) {
        partitions = tester.takeOutputElements(multiPartitionsTag);
    } else {
        partitions = tester.takeOutputElements(singlePartitionTag);
    }
    List<ShardedKey<String>> partitionsResult = Lists.newArrayList();
    Map<String, List<String>> filesPerTableResult = Maps.newHashMap();
    for (KV<ShardedKey<String>, List<String>> partition : partitions) {
        String table = partition.getKey().getKey();
        partitionsResult.add(partition.getKey());
        List<String> tableFilesResult = filesPerTableResult.get(table);
        if (tableFilesResult == null) {
            tableFilesResult = Lists.newArrayList();
            filesPerTableResult.put(table, tableFilesResult);
        }
        tableFilesResult.addAll(partition.getValue());
    }
    assertThat(partitionsResult, containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
    if (isSingleton) {
        assertEquals(1, filesPerTableResult.size());
        List<String> singletonFiles = filesPerTableResult.values().iterator().next();
        assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
        assertThat(Files.readAllBytes(Paths.get(singletonFiles.get(0))).length, Matchers.equalTo(0));
    } else {
        assertEquals(filenamesPerTable, filesPerTableResult);
    }
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) KV(org.apache.beam.sdk.values.KV) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList)
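
The test above exercises WritePartition through DoFnTester rather than by running the pipeline; the pipeline exists only so the PCollectionViews can be constructed, which is why abandoned-node enforcement is disabled. Below is a minimal sketch of the same technique (note that DoFnTester was deprecated in later Beam releases); ThresholdFn and the values used are invented for illustration, not part of Beam.

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.DoFnTester;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.values.PCollectionView;

public class DoFnTesterSketch {

    // A toy DoFn that keeps only elements at or above a threshold side input.
    static class ThresholdFn extends DoFn<Integer, Integer> {
        private final PCollectionView<Integer> thresholdView;

        ThresholdFn(PCollectionView<Integer> thresholdView) {
            this.thresholdView = thresholdView;
        }

        @ProcessElement
        public void processElement(ProcessContext c) {
            if (c.element() >= c.sideInput(thresholdView)) {
                c.output(c.element());
            }
        }
    }

    public static void check() throws Exception {
        // The pipeline is only used to construct the view; it is never run,
        // mirroring testWritePartition above.
        TestPipeline p = TestPipeline.create();
        p.enableAbandonedNodeEnforcement(false);
        PCollectionView<Integer> thresholdView =
            p.apply(Create.of(10)).apply(View.<Integer>asSingleton());

        DoFnTester<Integer, Integer> tester = DoFnTester.of(new ThresholdFn(thresholdView));
        // Supply the side input's value for the global window, then process a bundle.
        tester.setSideInput(thresholdView, GlobalWindow.INSTANCE, 10);
        List<Integer> output = tester.processBundle(3, 7, 42);
        assertEquals(Arrays.asList(42), output);
    }
}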

Aggregations

List (java.util.List) 2
Result (org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) 2
KV (org.apache.beam.sdk.values.KV) 2
TupleTag (org.apache.beam.sdk.values.TupleTag) 2
TableRow (com.google.api.services.bigquery.model.TableRow) 1
ImmutableList (com.google.common.collect.ImmutableList) 1
ArrayList (java.util.ArrayList) 1
Map (java.util.Map) 1
Pipeline (org.apache.beam.sdk.Pipeline) 1
BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) 1
GlobalWindows (org.apache.beam.sdk.transforms.windowing.GlobalWindows) 1
PCollectionList (org.apache.beam.sdk.values.PCollectionList) 1
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple) 1
PCollectionView (org.apache.beam.sdk.values.PCollectionView) 1
TupleTagList (org.apache.beam.sdk.values.TupleTagList) 1