Use of org.apache.beam.sdk.io.fs.ResourceId in project DataflowJavaSDK-examples by GoogleCloudPlatform.
The class WriteOneFilePerWindow, method expand.
@Override
public PDone expand(PCollection<String> input) {
  // filenamePrefix may contain a directory and a filename component. Pull out only the
  // filename component from that path for the PerWindowFiles.
  String prefix = "";
  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  if (!resource.isDirectory()) {
    prefix = verifyNotNull(
        resource.getFilename(),
        "A non-directory resource should have a non-null filename: %s",
        resource);
  }
  TextIO.Write write =
      TextIO.write()
          .to(resource.getCurrentDirectory())
          .withFilenamePolicy(new PerWindowFiles(prefix))
          .withWindowedWrites();
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }
  return input.apply(write);
}
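For context, a minimal standalone sketch of the directory/filename split performed above. The path "/tmp/output/events" is a hypothetical prefix, not from the original; the last segment becomes the filename component unless the spec denotes a directory.
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.fs.ResourceId;

public class ResourceIdSplitSketch {
  public static void main(String[] args) {
    // Hypothetical prefix with a directory and a filename component.
    ResourceId resource = FileBasedSink.convertToFileResourceIfPossible("/tmp/output/events");
    System.out.println(resource.isDirectory());         // false
    System.out.println(resource.getCurrentDirectory()); // e.g. /tmp/output/
    System.out.println(resource.getFilename());         // events
  }
}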
Use of org.apache.beam.sdk.io.fs.ResourceId in project beam by apache.
The class PTransformMatchersTest, method writeWithRunnerDeterminedSharding.
@Test
public void writeWithRunnerDeterminedSharding() {
  ResourceId outputDirectory = LocalResources.fromString("/foo/bar", true /* isDirectory */);
  FilenamePolicy policy =
      DefaultFilenamePolicy.constructUsingStandardParameters(
          StaticValueProvider.of(outputDirectory),
          DefaultFilenamePolicy.DEFAULT_SHARD_TEMPLATE,
          "");
  WriteFiles<Integer> write =
      WriteFiles.to(
          new FileBasedSink<Integer>(StaticValueProvider.of(outputDirectory), policy) {
            @Override
            public WriteOperation<Integer> createWriteOperation() {
              return null;
            }
          });
  // No sharding configured: the matcher should accept runner-determined sharding.
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding().matches(appliedWrite(write)),
      is(true));
  // A fixed shard count or a custom sharding view disqualifies the transform.
  WriteFiles<Integer> withStaticSharding = write.withNumShards(3);
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withStaticSharding)),
      is(false));
  WriteFiles<Integer> withCustomSharding =
      write.withSharding(Sum.integersGlobally().asSingletonView());
  assertThat(
      PTransformMatchers.writeWithRunnerDeterminedSharding()
          .matches(appliedWrite(withCustomSharding)),
      is(false));
}
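A minimal sketch of the LocalResources factory used above; the paths are hypothetical. The boolean flag decides whether the resulting ResourceId names a directory or a file.
import org.apache.beam.sdk.io.LocalResources;
import org.apache.beam.sdk.io.fs.ResourceId;

public class LocalResourcesSketch {
  public static void main(String[] args) {
    ResourceId dir = LocalResources.fromString("/foo/bar", true /* isDirectory */);
    ResourceId file = LocalResources.fromString("/foo/bar/part-00000", false /* isDirectory */);
    System.out.println(dir.isDirectory());  // true
    System.out.println(file.isDirectory()); // false
    System.out.println(file.getFilename()); // part-00000
  }
}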
Use of org.apache.beam.sdk.io.fs.ResourceId in project beam by apache.
The class BigQueryIO, method getExtractFilePaths.
static List<ResourceId> getExtractFilePaths(String extractDestinationDir, Job extractJob)
    throws IOException {
  JobStatistics jobStats = extractJob.getStatistics();
  List<Long> counts = jobStats.getExtract().getDestinationUriFileCounts();
  if (counts.size() != 1) {
    String errorMessage =
        (counts.size() == 0
            ? "No destination uri file count received."
            : String.format(
                "More than one destination uri file count received. First two are %s, %s",
                counts.get(0), counts.get(1)));
    throw new RuntimeException(errorMessage);
  }
  long filesCount = counts.get(0);
  ImmutableList.Builder<ResourceId> paths = ImmutableList.builder();
  ResourceId extractDestinationDirResourceId =
      FileSystems.matchNewResource(extractDestinationDir, true /* isDirectory */);
  for (long i = 0; i < filesCount; ++i) {
    ResourceId filePath =
        extractDestinationDirResourceId.resolve(
            String.format("%012d%s", i, ".avro"),
            ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
    paths.add(filePath);
  }
  return paths.build();
}
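A minimal sketch of the same resolve pattern in isolation; the directory "/tmp/extract/" and the count of 3 are assumed values. It shows how zero-padded .avro names are derived from a directory ResourceId.
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;

public class ResolveSketch {
  public static void main(String[] args) {
    // Hypothetical extract directory and file count.
    ResourceId dir = FileSystems.matchNewResource("/tmp/extract/", true /* isDirectory */);
    for (long i = 0; i < 3; ++i) {
      ResourceId file =
          dir.resolve(String.format("%012d%s", i, ".avro"), StandardResolveOptions.RESOLVE_FILE);
      System.out.println(file); // /tmp/extract/000000000000.avro, ...
    }
  }
}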
Use of org.apache.beam.sdk.io.fs.ResourceId in project beam by apache.
The class BigQuerySourceBase, method split.
@Override
public List<BoundedSource<TableRow>> split(long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  // The split result is cached so that repeated split() calls do not issue
  // another BigQuery extract job.
  if (cachedSplitResult == null) {
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    TableReference tableToExtract = getTableToExtract(bqOptions);
    JobService jobService = bqServices.getJobService(bqOptions);
    final String extractDestinationDir =
        resolveTempLocation(bqOptions.getTempLocation(), "BigQueryExtractTemp", stepUuid);
    String extractJobId = getExtractJobId(createJobIdToken(options.getJobName(), stepUuid));
    List<ResourceId> tempFiles =
        executeExtract(
            extractJobId, tableToExtract, jobService, bqOptions.getProject(),
            extractDestinationDir);
    TableSchema tableSchema =
        bqServices.getDatasetService(bqOptions).getTable(tableToExtract).getSchema();
    cleanupTempResource(bqOptions);
    cachedSplitResult = checkNotNull(createSources(tempFiles, tableSchema));
  }
  return cachedSplitResult;
}
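The null-check-and-cache idiom above is worth isolating. A minimal sketch with hypothetical names; runExpensiveExtract stands in for the BigQuery extract job.
import java.util.Arrays;
import java.util.List;

class CachedSplitSketch {
  private List<String> cachedSplitResult;

  // Runs the expensive work at most once; later calls reuse the cached value.
  List<String> split() {
    if (cachedSplitResult == null) {
      cachedSplitResult = runExpensiveExtract();
    }
    return cachedSplitResult;
  }

  private List<String> runExpensiveExtract() {
    return Arrays.asList("shard-0", "shard-1"); // stand-in for real extract output
  }
}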
Use of org.apache.beam.sdk.io.fs.ResourceId in project beam by apache.
The class BigQuerySourceBase, method createSources.
private List<BoundedSource<TableRow>> createSources(
    List<ResourceId> files, TableSchema tableSchema) throws IOException, InterruptedException {
  final String jsonSchema = BigQueryIO.JSON_FACTORY.toString(tableSchema);
  SerializableFunction<GenericRecord, TableRow> function =
      new SerializableFunction<GenericRecord, TableRow>() {
        @Override
        public TableRow apply(GenericRecord input) {
          return BigQueryAvroUtils.convertGenericRecordToTableRow(
              input, BigQueryHelpers.fromJsonString(jsonSchema, TableSchema.class));
        }
      };
  List<BoundedSource<TableRow>> avroSources = Lists.newArrayList();
  for (ResourceId file : files) {
    avroSources.add(
        new TransformingSource<>(
            AvroSource.from(file.toString()), function, getDefaultOutputCoder()));
  }
  return ImmutableList.copyOf(avroSources);
}
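Since AvroSource.from takes a file-pattern string, the loop above relies on ResourceId.toString() producing a spec the file system can match again. A minimal sketch under assumed values; the path is hypothetical.
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResourceId;

public class ResourceIdToSpecSketch {
  public static void main(String[] args) {
    // Hypothetical extract file; toString() round-trips to a readable spec.
    ResourceId file = FileSystems.matchNewResource("/tmp/extract/000000000000.avro", false);
    String spec = file.toString(); // e.g. "/tmp/extract/000000000000.avro"
    System.out.println(spec);
  }
}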