use of com.google.api.services.bigquery.model.TableRow in project beam by apache.
the class BigqueryMatcher method formatRows.
private String formatRows(int totalNumRows) {
  StringBuilder samples = new StringBuilder();
  List<TableRow> rows = response.getRows();
  for (int i = 0; i < totalNumRows && i < rows.size(); i++) {
    samples.append(String.format("%n\t\t"));
    for (TableCell field : rows.get(i).getF()) {
      samples.append(String.format("%-10s", field.getV()));
    }
  }
  if (rows.size() > totalNumRows) {
    samples.append(String.format("%n\t\t..."));
  }
  return samples.toString();
}
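The method lines sample rows up in columns with String.format's left-justified width specifier ("%-10s") and the platform line separator ("%n"). A minimal standalone sketch of that formatting technique, using hypothetical cell values in place of TableCell.getV() results:

import java.util.Arrays;
import java.util.List;

public class ColumnFormatDemo {
  public static void main(String[] args) {
    // Hypothetical cell values standing in for a query result.
    List<List<String>> rows =
        Arrays.asList(Arrays.asList("abc", "2"), Arrays.asList("def", "10"));
    StringBuilder samples = new StringBuilder();
    for (List<String> row : rows) {
      samples.append(String.format("%n\t\t"));           // line separator plus indent
      for (String value : row) {
        samples.append(String.format("%-10s", value));    // left-justify each cell in a 10-char column
      }
    }
    System.out.println(samples);
  }
}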
use of com.google.api.services.bigquery.model.TableRow in project beam by apache.
the class BigqueryMatcher method generateHash.
private String generateHash(@Nonnull List<TableRow> rows) {
  List<HashCode> rowHashes = Lists.newArrayList();
  for (TableRow row : rows) {
    List<String> cellsInOneRow = Lists.newArrayList();
    for (TableCell cell : row.getF()) {
      cellsInOneRow.add(Objects.toString(cell.getV()));
      Collections.sort(cellsInOneRow);
    }
    rowHashes.add(Hashing.sha1().hashString(cellsInOneRow.toString(), StandardCharsets.UTF_8));
  }
  return Hashing.combineUnordered(rowHashes).toString();
}
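The checksum is insensitive to cell order (each row's cells are sorted before hashing) and to row order (the per-row hashes are merged with Hashing.combineUnordered). A minimal self-contained sketch of the same order-insensitive hashing technique on plain strings, assuming only Guava on the classpath:

import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class UnorderedHashDemo {
  // Hash each row independently, then combine the hashes without regard to order.
  static String hashRows(List<String> rows) {
    List<HashCode> rowHashes = new ArrayList<>();
    for (String row : rows) {
      rowHashes.add(Hashing.sha1().hashString(row, StandardCharsets.UTF_8));
    }
    return Hashing.combineUnordered(rowHashes).toString();
  }

  public static void main(String[] args) {
    // The combined hash is the same regardless of row order.
    System.out.println(hashRows(Arrays.asList("a", "b", "c")));
    System.out.println(hashRows(Arrays.asList("c", "a", "b")));
  }
}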
use of com.google.api.services.bigquery.model.TableRow in project beam by apache.
the class BigqueryMatcherTest method createResponseContainingTestData.
private QueryResponse createResponseContainingTestData() {
  TableCell field1 = new TableCell();
  field1.setV("abc");
  TableCell field2 = new TableCell();
  field2.setV("2");
  TableCell field3 = new TableCell();
  field3.setV("testing BigQuery matcher.");
  TableRow row = new TableRow();
  row.setF(Lists.newArrayList(field1, field2, field3));
  QueryResponse response = new QueryResponse();
  response.setJobComplete(true);
  response.setRows(Lists.newArrayList(row));
  response.setTotalRows(BigInteger.ONE);
  return response;
}
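The test helper builds a single-row QueryResponse entirely in memory. A short hedged sketch of walking such a response back, row by row and cell by cell, the same way formatRows and generateHash consume it (the printRows helper is hypothetical, not part of the matcher):

import com.google.api.services.bigquery.model.QueryResponse;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableRow;

public class QueryResponseWalker {
  // Hypothetical helper: print every cell value of every row in the response.
  static void printRows(QueryResponse response) {
    if (response.getRows() == null) {
      return; // an empty result set carries no rows list
    }
    for (TableRow row : response.getRows()) {
      for (TableCell cell : row.getF()) {
        System.out.print(cell.getV() + "\t");
      }
      System.out.println();
    }
  }
}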
use of com.google.api.services.bigquery.model.TableRow in project beam by apache.
the class BatchLoads method expand.
@Override
public WriteResult expand(PCollection<KV<DestinationT, TableRow>> input) {
  Pipeline p = input.getPipeline();
  final String stepUuid = BigQueryHelpers.randomUUIDString();
  PCollectionView<String> tempFilePrefix =
      p.apply("Create", Create.of((Void) null))
          .apply("GetTempFilePrefix", ParDo.of(new DoFn<Void, String>() {
            @ProcessElement
            public void getTempFilePrefix(ProcessContext c) {
              c.output(resolveTempLocation(
                  c.getPipelineOptions().getTempLocation(), "BigQueryWriteTemp", stepUuid));
            }
          }))
          .apply("TempFilePrefixView", View.<String>asSingleton());
  // Create a singleton job ID token at execution time. This will be used as the base for all
  // load jobs issued from this instance of the transform.
  PCollectionView<String> jobIdTokenView =
      p.apply("TriggerIdCreation", Create.of("ignored"))
          .apply("CreateJobId", MapElements.via(new SimpleFunction<String, String>() {
            @Override
            public String apply(String input) {
              return stepUuid;
            }
          }))
          .apply(View.<String>asSingleton());
  PCollection<KV<DestinationT, TableRow>> inputInGlobalWindow =
      input.apply(
          "rewindowIntoGlobal",
          Window.<KV<DestinationT, TableRow>>into(new GlobalWindows())
              .triggering(DefaultTrigger.of())
              .discardingFiredPanes());
  PCollectionView<Map<DestinationT, String>> schemasView =
      inputInGlobalWindow.apply(new CalculateSchemas<>(dynamicDestinations));
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittedRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, TableRow>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      inputInGlobalWindow.apply(
          "WriteBundlesToFiles",
          ParDo.of(new WriteBundlesToFiles<>(
                  stepUuid, unwrittedRecordsTag, maxNumWritersPerBundle, maxFileSize))
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the records are
  // grouped by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeBundlesTuple
          .get(unwrittedRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), TableRowJsonCoder.of()))
          .apply(GroupByKey.<ShardedKey<DestinationT>, TableRow>create())
          .apply(ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
              .withSideInputs(tempFilePrefix))
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  // PCollection of filename, file byte size, and table destination.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> results =
      PCollectionList.of(writtenFiles)
          .and(writtenFilesGrouped)
          .apply(Flatten.<Result<DestinationT>>pCollections());
  TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("multiPartitionsTag") {};
  TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag =
      new TupleTag<KV<ShardedKey<DestinationT>, List<String>>>("singlePartitionTag") {};
  // Turn the list of files and record counts into a PCollectionView that can be used as a
  // side input.
  PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> resultsView =
      results.apply("ResultsView", View.<WriteBundlesToFiles.Result<DestinationT>>asIterable());
  // This transform will look at the set of files written for each table, and if any table has
  // too many files or bytes, will partition that table's files into multiple partitions for
  // loading.
  PCollection<Void> singleton =
      p.apply("singleton", Create.of((Void) null).withCoder(VoidCoder.of()));
  PCollectionTuple partitions =
      singleton.apply(
          "WritePartition",
          ParDo.of(new WritePartition<>(
                  singletonTable, tempFilePrefix, resultsView, multiPartitionsTag,
                  singlePartitionTag))
              .withSideInputs(tempFilePrefix, resultsView)
              .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
  List<PCollectionView<?>> writeTablesSideInputs =
      Lists.newArrayList(jobIdTokenView, schemasView);
  writeTablesSideInputs.addAll(dynamicDestinations.getSideInputs());
  Coder<KV<ShardedKey<DestinationT>, List<String>>> partitionsCoder =
      KvCoder.of(
          ShardedKeyCoder.of(NullableCoder.of(destinationCoder)),
          ListCoder.of(StringUtf8Coder.of()));
  // If WriteBundlesToFiles produced more than MAX_NUM_FILES files or MAX_SIZE_BYTES bytes, then
  // the import needs to be split into multiple partitions, and those partitions will be
  // specified in multiPartitionsTag.
  PCollection<KV<TableDestination, String>> tempTables =
      partitions
          .get(multiPartitionsTag)
          .setCoder(partitionsCoder)
          .apply("MultiPartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
          .apply(
              "MultiPartitionsWriteTables",
              ParDo.of(new WriteTables<>(
                      false, bigQueryServices, jobIdTokenView, schemasView,
                      WriteDisposition.WRITE_EMPTY, CreateDisposition.CREATE_IF_NEEDED,
                      dynamicDestinations))
                  .withSideInputs(writeTablesSideInputs));
  // This view maps each final table destination to the set of temporary partitioned tables
  // the PCollection was loaded into.
  PCollectionView<Map<TableDestination, Iterable<String>>> tempTablesView =
      tempTables.apply("TempTablesView", View.<TableDestination, String>asMultimap());
  singleton.apply(
      "WriteRename",
      ParDo.of(new WriteRename(
              bigQueryServices, jobIdTokenView, writeDisposition, createDisposition,
              tempTablesView))
          .withSideInputs(tempTablesView, jobIdTokenView));
  // Write the single partition directly to the final table.
  partitions
      .get(singlePartitionTag)
      .setCoder(partitionsCoder)
      .apply("SinglePartitionsReshuffle", Reshuffle.<ShardedKey<DestinationT>, List<String>>of())
      .apply(
          "SinglePartitionWriteTables",
          ParDo.of(new WriteTables<>(
                  true, bigQueryServices, jobIdTokenView, schemasView, writeDisposition,
                  createDisposition, dynamicDestinations))
              .withSideInputs(writeTablesSideInputs));
  PCollection<TableRow> empty =
      p.apply("CreateEmptyFailedInserts", Create.empty(TypeDescriptor.of(TableRow.class)));
  return WriteResult.in(input.getPipeline(), new TupleTag<TableRow>("failedInserts"), empty);
}
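BatchLoads is an internal transform of BigQueryIO; a pipeline normally reaches it through BigQueryIO.writeTableRows() rather than applying it directly. A hedged sketch of a pipeline that would exercise this file-loads path, where the project, dataset, table, and schema are placeholders and the explicit FILE_LOADS method assumes an SDK version that exposes the Method enum:

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class FileLoadsExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    // Placeholder schema matching the single row created below.
    TableSchema schema = new TableSchema().setFields(Arrays.asList(
        new TableFieldSchema().setName("name").setType("STRING"),
        new TableFieldSchema().setName("count").setType("INTEGER")));
    pipeline
        .apply("CreateRows",
            Create.of(new TableRow().set("name", "abc").set("count", 2))
                .withCoder(TableRowJsonCoder.of()))
        .apply("WriteToBigQuery",
            BigQueryIO.writeTableRows()
                .to("my-project:my_dataset.my_table") // placeholder destination
                .withSchema(schema)
                .withMethod(BigQueryIO.Write.Method.FILE_LOADS) // routes through BatchLoads
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));
    pipeline.run().waitUntilFinish();
  }
}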
use of com.google.api.services.bigquery.model.TableRow in project beam by apache.
the class BigQueryTableRowIterator method advance.
boolean advance() throws IOException, InterruptedException {
  while (true) {
    if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
      // Embed schema information into the raw row, so that values have an
      // associated key.
      current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
      return true;
    }
    if (lastPage) {
      return false;
    }
    Bigquery.Tabledata.List list =
        client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
    if (pageToken != null) {
      list.setPageToken(pageToken);
    }
    TableDataList result = executeWithBackOff(
        list,
        String.format(
            "Error reading from BigQuery table %s of dataset %s.",
            ref.getTableId(), ref.getDatasetId()));
    pageToken = result.getPageToken();
    iteratorOverCurrentBatch =
        result.getRows() != null
            ? result.getRows().iterator()
            : Collections.<TableRow>emptyIterator();
    // The server may return a page token indefinitely on a zero-length table.
    if (pageToken == null || (result.getTotalRows() != null && result.getTotalRows() == 0)) {
      lastPage = true;
    }
  }
}
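The loop is a standard page-token cursor over tabledata.list: request a page, process its rows, and repeat until the token comes back null or the table is known to be empty. A minimal sketch of the same paging pattern against the BigQuery client, without the backoff and schema-typing layers; the dumpTable helper is hypothetical and assumes an authorized Bigquery client and a populated TableReference:

import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import java.io.IOException;

public class TableDataPager {
  // Hypothetical helper mirroring the paging logic in advance().
  static void dumpTable(Bigquery client, TableReference ref) throws IOException {
    String pageToken = null;
    while (true) {
      Bigquery.Tabledata.List request =
          client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
      if (pageToken != null) {
        request.setPageToken(pageToken);
      }
      TableDataList page = request.execute();
      if (page.getRows() != null) {
        for (TableRow row : page.getRows()) {
          System.out.println(row); // process one raw (untyped) row
        }
      }
      pageToken = page.getPageToken();
      // Stop on the last page, or when the table is known to be empty (the server may
      // keep returning a page token for a zero-length table).
      if (pageToken == null || (page.getTotalRows() != null && page.getTotalRows() == 0)) {
        break;
      }
    }
  }
}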