use of org.apache.beam.sdk.util.NumberedShardedFile in project beam by apache.
the class TfIdfIT method testE2ETfIdf.
/**
 * End-to-end check of the TfIdf example: runs the pipeline on {@code DEFAULT_INPUT}, writing to a
 * timestamped directory under the configured temp root, and verifies the checksum of the sharded
 * CSV output.
 */
@Test
public void testE2ETfIdf() throws Exception {
  TfIdfITOptions options = TestPipeline.testingPipelineOptions().as(TfIdfITOptions.class);
  options.setInput(DEFAULT_INPUT);

  // A per-run directory name (date, hour, minute, second, millis) keeps runs from colliding.
  String runDirectoryName = String.format("TfIdfIT-%tF-%<tH-%<tM-%<tS-%<tL", new Date());
  String outputPrefix =
      FileSystems.matchNewResource(options.getTempRoot(), true)
          .resolve(runDirectoryName, StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
          .resolve("results", StandardResolveOptions.RESOLVE_FILE)
          .toString();
  options.setOutput(outputPrefix);

  TfIdf.runTfIdf(options);

  // Glob over every CSV shard produced under the output prefix.
  String shardGlob = options.getOutput() + "*-of-*.csv";
  NumberedShardedFile shardedOutput = new NumberedShardedFile(shardGlob, DEFAULT_SHARD_TEMPLATE);
  assertThat(shardedOutput, fileContentsHaveChecksum(EXPECTED_OUTPUT_CHECKSUM));
}
use of org.apache.beam.sdk.util.NumberedShardedFile in project beam by apache.
the class WindowedWordCountIT method testWindowedWordCountPipeline.
/**
 * Runs the windowed word count pipeline with the given options and asserts that the per-window
 * output files contain the word counts computed directly from the input file.
 *
 * @param options pipeline options supplying the input file, output location and minimum timestamp
 * @throws Exception if reading the input or running the pipeline fails
 */
private void testWindowedWordCountPipeline(WindowedWordCountITOptions options) throws Exception {
  ResourceId output = FileBasedSink.convertToFileResourceIfPossible(options.getOutput());
  PerWindowFiles filenamePolicy = new PerWindowFiles(output);

  // One expected sharded file per ten-minute window, starting at minutes 0, 10, ..., 50.
  List<ShardedFile> expectedOutputFiles = Lists.newArrayListWithCapacity(6);
  for (int startMinute = 0; startMinute <= 50; startMinute += 10) {
    final Instant windowStart =
        new Instant(options.getMinTimestampMillis()).plus(Duration.standardMinutes(startMinute));
    final Instant windowEnd = windowStart.plus(Duration.standardMinutes(10));
    String filePrefix =
        filenamePolicy.filenamePrefixForWindow(new IntervalWindow(windowStart, windowEnd));
    String shardGlob =
        output.getCurrentDirectory().resolve(filePrefix, StandardResolveOptions.RESOLVE_FILE).toString()
            + "*";
    expectedOutputFiles.add(new NumberedShardedFile(shardGlob));
  }

  ShardedFile inputFile = new ExplicitShardedFile(Collections.singleton(options.getInputFile()));

  // For this integration test, input is tiny and we can build the expected counts
  SortedMap<String, Long> expectedWordCounts = new TreeMap<>();
  for (String line : inputFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff())) {
    for (String word : line.split(ExampleUtils.TOKENIZER_PATTERN, -1)) {
      // The -1 limit keeps empty tokens in the split result; they are not words.
      if (word.isEmpty()) {
        continue;
      }
      expectedWordCounts.merge(word, 1L, Long::sum);
    }
  }

  WindowedWordCount.runWindowedWordCount(options);

  assertThat(expectedOutputFiles, containsWordCounts(expectedWordCounts));
}
use of org.apache.beam.sdk.util.NumberedShardedFile in project beam by apache.
the class FileChecksumMatcherTest method testMatcherThatVerifiesMultipleFiles.
/**
 * Verifies that the checksum matcher works over several shard files selected by a glob: two
 * {@code result-*} files are written alongside a file named {@code tmp} that the glob does not
 * match.
 */
@Test
public void testMatcherThatVerifiesMultipleFiles() throws IOException {
  // TODO: Java core test failing on windows, https://issues.apache.org/jira/browse/BEAM-10747
  assumeFalse(SystemUtils.IS_OS_WINDOWS);

  File firstShard = tmpFolder.newFile("result-000-of-002");
  Files.write("To be or not to be, ", firstShard, StandardCharsets.UTF_8);

  File secondShard = tmpFolder.newFile("result-001-of-002");
  Files.write("it is not a question.", secondShard, StandardCharsets.UTF_8);

  // Name does not start with "result-", so the glob below does not select it.
  File unrelatedFile = tmpFolder.newFile("tmp");
  Files.write("tmp", unrelatedFile, StandardCharsets.UTF_8);

  String shardGlob = tmpFolder.getRoot().toPath().resolve("result-*").toString();
  NumberedShardedFile shardedFile = new NumberedShardedFile(shardGlob);
  assertThat(shardedFile, fileContentsHaveChecksum("90552392c28396935fe4f123bd0b5c2d0f6260c8"));
}
use of org.apache.beam.sdk.util.NumberedShardedFile in project beam by apache.
the class FileChecksumMatcherTest method testMatcherThatUsesCustomizedTemplate.
/**
 * Verifies that the checksum matcher works when {@code NumberedShardedFile} is given a custom
 * shard-name pattern (names of the form {@code result<shardnum>-total<numshards>}) instead of
 * its default.
 */
@Test
public void testMatcherThatUsesCustomizedTemplate() throws Exception {
  // Customized template: resultSSS-totalNNN
  // TODO: Java core test failing on windows, https://issues.apache.org/jira/browse/BEAM-10749
  assumeFalse(SystemUtils.IS_OS_WINDOWS);

  File firstShard = tmpFolder.newFile("result0-total2");
  Files.write("To be or not to be, ", firstShard, StandardCharsets.UTF_8);

  File secondShard = tmpFolder.newFile("result1-total2");
  Files.write("it is not a question.", secondShard, StandardCharsets.UTF_8);

  // (?x) enables comments mode, so the spaces inside the pattern are ignored when matching.
  Pattern customizedTemplate =
      Pattern.compile("(?x) result (?<shardnum>\\d+) - total (?<numshards>\\d+)");

  String everythingGlob = tmpFolder.getRoot().toPath().resolve("*").toString();
  NumberedShardedFile shardedFile = new NumberedShardedFile(everythingGlob, customizedTemplate);
  assertThat(shardedFile, fileContentsHaveChecksum("90552392c28396935fe4f123bd0b5c2d0f6260c8"));
}
Aggregations