Use of org.apache.beam.sdk.io.range.OffsetRange in the Apache Beam project.
Class OutputAndTimeBoundedSplittableProcessElementInvokerTest, method testInvokeProcessElementTimeBounded.
@Test
public void testInvokeProcessElementTimeBounded() throws Exception {
  SplittableProcessElementInvoker<Void, String, OffsetRange, Long, Void>.Result result =
      runTest(10000, Duration.ZERO, Integer.MAX_VALUE, Duration.millis(100));

  // The invocation is expected to finish without asking to be resumed.
  assertFalse(result.getContinuation().shouldResume());

  // Ideally about 30 elements get processed, but timing makes the exact number flaky, so only
  // check that the start of the residual lies within a generous window.
  OffsetRange residual = result.getResidualRestriction();
  long residualStart = residual.getFrom();
  assertThat(residualStart, greaterThan(10L));
  assertThat(residualStart, lessThan(100L));

  // The end of the residual must still be the end of the original range.
  assertEquals(10000, residual.getTo());
}
Use of org.apache.beam.sdk.io.range.OffsetRange in the Apache Beam project.
Class TextRowCountEstimator, method estimateRowCount.
/**
 * Estimates the number of non-empty rows. It samples NumSampledBytesPerFile bytes from every file
 * until the condition in the sampling strategy is met. Then it takes the average line size of the
 * sampled rows and divides the total size of the visited files by that number. If all the sampled
 * rows are empty, and it has not sampled all the lines (due to the sampling strategy or the
 * per-file sampling window), it throws an exception.
 *
 * <p>The number of bytes sampled so far is accumulated and handed to the sampling strategy on
 * every iteration, so that a strategy can stop based on the amount of sampled data as well as on
 * the number of sampled files.
 *
 * @param pipelineOptions options used to create the readers for the sampled files.
 * @return Number of estimated rows; {@code 0} when everything was sampled and no non-empty row
 *     was found.
 * @throws IOException if matching or reading the files fails.
 * @throws org.apache.beam.sdk.io.TextRowCountEstimator.NoEstimationException if all the sampled
 *     lines are empty and we have not read all the lines in the matched files.
 */
@SuppressFBWarnings(value = "RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE", justification = "https://github.com/spotbugs/spotbugs/issues/756")
public Double estimateRowCount(PipelineOptions pipelineOptions) throws IOException, NoEstimationException {
  long linesSize = 0;
  int numberOfReadLines = 0;
  long totalFileSizes = 0;
  long totalSampledBytes = 0;
  int numberOfReadFiles = 0;
  boolean sampledEverything = true;

  MatchResult match = FileSystems.match(getFilePattern(), getEmptyMatchTreatment());
  for (MatchResult.Metadata metadata : match.metadata()) {
    // Ask the strategy before touching the next file; stopping early means part of the input
    // was never looked at.
    if (getSamplingStrategy().stopSampling(numberOfReadFiles, totalSampledBytes)) {
      sampledEverything = false;
      break;
    }
    if (FileIO.ReadMatches.shouldSkipDirectory(metadata, getDirectoryTreatment())) {
      continue;
    }
    FileIO.ReadableFile file = FileIO.ReadMatches.matchToReadableFile(metadata, getCompression());

    // We use this as an estimate of the size of the sampled lines. Since the last sampled line
    // may exceed this range, we are over estimating the number of lines in our estimation. (If
    // each line is larger than readingWindowSize we will read one line any way and that line is
    // the last line)
    long fileSize = metadata.sizeBytes();
    long readingWindowSize = Math.min(getNumSampledBytesPerFile(), fileSize);
    sampledEverything = fileSize == readingWindowSize && sampledEverything;
    OffsetRange range = new OffsetRange(0, readingWindowSize);

    TextSource textSource = new TextSource(ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().toString()), getEmptyMatchTreatment(), getDelimiters());
    FileBasedSource<String> source = CompressedSource.from(textSource).withCompression(file.getCompression());
    try (BoundedSource.BoundedReader<String> reader = source.createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo()).createReader(pipelineOptions)) {
      int numberOfNonEmptyLines = 0;
      for (boolean more = reader.start(); more; more = reader.advance()) {
        // Lines consisting only of whitespace do not count as rows.
        if (!reader.getCurrent().trim().isEmpty()) {
          numberOfNonEmptyLines++;
        }
      }
      numberOfReadLines += numberOfNonEmptyLines;
      // A window that produced no rows must not contribute to the average line size.
      linesSize += (numberOfNonEmptyLines == 0) ? 0 : readingWindowSize;
    }

    // Keep the byte counter seen by the sampling strategy up to date. Previously it was never
    // incremented, so the strategy was always told that zero bytes had been sampled.
    totalSampledBytes += readingWindowSize;
    numberOfReadFiles += fileSize == 0 ? 0 : 1;
    totalFileSizes += fileSize;
  }

  if (numberOfReadLines == 0 && sampledEverything) {
    return 0d;
  }
  if (numberOfReadLines == 0) {
    throw new NoEstimationException("Cannot estimate the row count. All the sampled lines are empty");
  }
  // This is total file sizes divided by average line size.
  return (double) totalFileSizes * numberOfReadLines / linesSize;
}
Use of org.apache.beam.sdk.io.range.OffsetRange in the Apache Beam project.
Class GrowableOffsetRangeTracker, method trySplit.
@Override
public SplitResult<OffsetRange> trySplit(double fractionOfRemainder) {
  // Only a non-empty range whose end is still Long.MAX_VALUE is treated as growable; anything
  // else is split like a normal range by the parent tracker.
  boolean growable = range.getTo() == Long.MAX_VALUE && range.getTo() != range.getFrom();
  if (!growable) {
    return super.trySplit(fractionOfRemainder);
  }

  // Once the last attempted offset has reached Long.MAX_VALUE there is nothing left to split.
  if (lastAttemptedOffset != null && lastAttemptedOffset == Long.MAX_VALUE) {
    return null;
  }

  // All arithmetic below is done with BigDecimal to prevent overflow, which may result in loss
  // of precision.
  BigDecimal current;
  if (lastAttemptedOffset == null) {
    // Nothing attempted yet: start from one position before the beginning of the range.
    current = BigDecimal.valueOf(range.getFrom()).subtract(BigDecimal.ONE, MathContext.DECIMAL128);
  } else {
    current = BigDecimal.valueOf(lastAttemptedOffset);
  }

  // Fetch the estimated end offset. If the estimate is smaller than the next offset, the next
  // offset is used as the end instead.
  BigDecimal estimatedEnd = BigDecimal.valueOf(rangeEndEstimator.estimate());
  BigDecimal nextOffset = current.add(BigDecimal.ONE, MathContext.DECIMAL128);
  BigDecimal effectiveEnd = estimatedEnd.max(nextOffset);

  // split = current + max(1, (effectiveEnd - current) * fractionOfRemainder)
  BigDecimal remaining = effectiveEnd.subtract(current, MathContext.DECIMAL128);
  BigDecimal step =
      remaining
          .multiply(BigDecimal.valueOf(fractionOfRemainder), MathContext.DECIMAL128)
          .max(BigDecimal.ONE);
  long split = current.add(step, MathContext.DECIMAL128).longValue();

  // A split position past the effective end cannot be honored.
  if (split > effectiveEnd.longValue()) {
    return null;
  }

  // The residual keeps the original end; the tracked range is truncated at the split position
  // and returned as the primary.
  OffsetRange residual = new OffsetRange(split, range.getTo());
  this.range = new OffsetRange(range.getFrom(), split);
  return SplitResult.of(this.range, residual);
}
Use of org.apache.beam.sdk.io.range.OffsetRange in the Apache Beam project.
Class SplittableDoFnTest, method testBoundedness.
@Test
@Category(NeedsRunner.class)
public void testBoundedness() {
  // TestPipeline.create() is used here because the assertions are made without ever running
  // the pipeline.
  Pipeline pipeline = TestPipeline.create();
  PCollection<String> input = pipeline.apply(Create.of("foo"));

  // Case 1: the @ProcessElement method returns void; the output is expected to be BOUNDED.
  DoFn<String, String> voidProcessFn =
      new DoFn<String, String>() {
        @ProcessElement
        public void process(RestrictionTracker<OffsetRange, Long> tracker) {
          // Doesn't matter
        }

        @GetInitialRestriction
        public OffsetRange getInitialRestriction() {
          return new OffsetRange(0, 1);
        }
      };
  PCollection<String> boundedOutput = input.apply(ParDo.of(voidProcessFn));
  assertEquals(PCollection.IsBounded.BOUNDED, boundedOutput.isBounded());

  // Case 2: the @ProcessElement method returns a ProcessContinuation; the output is expected
  // to be UNBOUNDED.
  DoFn<String, String> continuationProcessFn =
      new DoFn<String, String>() {
        @ProcessElement
        public ProcessContinuation process(RestrictionTracker<OffsetRange, Long> tracker) {
          return stop();
        }

        @GetInitialRestriction
        public OffsetRange getInitialRestriction() {
          return new OffsetRange(0, 1);
        }
      };
  PCollection<String> unboundedOutput = input.apply(ParDo.of(continuationProcessFn));
  assertEquals(PCollection.IsBounded.UNBOUNDED, unboundedOutput.isBounded());
}
Use of org.apache.beam.sdk.io.range.OffsetRange in the Apache Beam project.
Class GrowableOffsetRangeTrackerTest, method testCheckpointJustStarted.
/**
 * Checks that a checkpoint ({@code trySplit(0)}) taken right after claiming offset 5 yields the
 * primary [0, 6) and the residual [6, Long.MAX_VALUE), both when the estimated range end is
 * below the claimed offset and when it is above it.
 */
@Test
public void testCheckpointJustStarted() throws Exception {
  SimpleEstimator simpleEstimator = new SimpleEstimator();

  // Scenario 1: the estimated end (0) lies before the claimed offset (5).
  GrowableOffsetRangeTracker tracker = new GrowableOffsetRangeTracker(0L, simpleEstimator);
  assertTrue(tracker.tryClaim(5L));
  simpleEstimator.setEstimateRangeEnd(0L);
  // Parameterized type instead of the raw SplitResult, so the compiler checks the element type.
  SplitResult<OffsetRange> res = tracker.trySplit(0);
  tracker.checkDone();
  assertEquals(new OffsetRange(0, 6), res.getPrimary());
  assertEquals(new OffsetRange(0, 6), tracker.currentRestriction());
  assertEquals(new OffsetRange(6, Long.MAX_VALUE), res.getResidual());

  // Scenario 2: the estimated end (20) lies after the claimed offset (5).
  tracker = new GrowableOffsetRangeTracker(0L, simpleEstimator);
  assertTrue(tracker.tryClaim(5L));
  simpleEstimator.setEstimateRangeEnd(20L);
  res = tracker.trySplit(0);
  tracker.checkDone();
  assertEquals(new OffsetRange(0, 6), res.getPrimary());
  // Same check as in the first scenario: the tracker itself must now hold the primary.
  assertEquals(new OffsetRange(0, 6), tracker.currentRestriction());
  assertEquals(new OffsetRange(6, Long.MAX_VALUE), res.getResidual());
}
Aggregations