Search in sources :

Example 41 with OffsetRange

use of org.apache.beam.sdk.io.range.OffsetRange in project beam by apache.

the class OutputAndTimeBoundedSplittableProcessElementInvokerTest method testInvokeProcessElementTimeBounded.

/**
 * Runs the invoker with a 100 ms time bound and checks that it stops without asking to resume,
 * leaving a residual restriction that starts after only a modest number of elements.
 */
@Test
public void testInvokeProcessElementTimeBounded() throws Exception {
    SplittableProcessElementInvoker<Void, String, OffsetRange, Long, Void>.Result result =
        runTest(10000, Duration.ZERO, Integer.MAX_VALUE, Duration.millis(100));
    assertFalse(result.getContinuation().shouldResume());

    OffsetRange residual = result.getResidualRestriction();
    long residualStart = residual.getFrom();
    // Ideally about 30 elements get processed, but timing makes the exact count unreliable,
    // so only verify that the residual start is not wildly off.
    assertThat(residualStart, greaterThan(10L));
    assertThat(residualStart, lessThan(100L));
    // The end of the residual must still be the original upper bound.
    assertEquals(10000, residual.getTo());
}
Also used : OffsetRange(org.apache.beam.sdk.io.range.OffsetRange) Test(org.junit.Test)

Example 42 with OffsetRange

use of org.apache.beam.sdk.io.range.OffsetRange in project beam by apache.

the class TextRowCountEstimator method estimateRowCount.

/**
 * Estimates the number of non empty rows. It samples up to NumSampledBytesPerFile bytes from every
 * matched file until the condition in the sampling strategy is met. Then it takes the average line
 * size of the sampled rows and divides the total size of the visited files by that number. If all
 * the sampled rows are empty, and it has not sampled all the lines (due to the sampling strategy
 * or the per-file byte limit), it throws an exception.
 *
 * @param pipelineOptions options handed to the readers created for sampling.
 * @return Number of estimated rows; {@code 0} when no non-empty line was found and every matched
 *     file was read in full.
 * @throws IOException if matching the file pattern or reading a sampled file fails.
 * @throws org.apache.beam.sdk.io.TextRowCountEstimator.NoEstimationException if all the sampled
 *     lines are empty and we have not read all the lines in the matched files.
 */
@SuppressFBWarnings(value = "RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE", justification = "https://github.com/spotbugs/spotbugs/issues/756")
public Double estimateRowCount(PipelineOptions pipelineOptions) throws IOException, NoEstimationException {
    long linesSize = 0;
    int numberOfReadLines = 0;
    long totalFileSizes = 0;
    long totalSampledBytes = 0;
    int numberOfReadFiles = 0;
    boolean sampledEverything = true;
    MatchResult match = FileSystems.match(getFilePattern(), getEmptyMatchTreatment());
    for (MatchResult.Metadata metadata : match.metadata()) {
        // Ask the strategy before touching the next file; stopping here means some files were
        // never looked at.
        if (getSamplingStrategy().stopSampling(numberOfReadFiles, totalSampledBytes)) {
            sampledEverything = false;
            break;
        }
        if (FileIO.ReadMatches.shouldSkipDirectory(metadata, getDirectoryTreatment())) {
            continue;
        }
        FileIO.ReadableFile file = FileIO.ReadMatches.matchToReadableFile(metadata, getCompression());
        long fileSize = metadata.sizeBytes();
        // We use this as an estimate of the size of the sampled lines. Since the last sampled line
        // may exceed this range, we are over estimating the number of lines in our estimation. (If
        // each line is larger than readingWindowSize we will read one line any way and that line is
        // the last line)
        long readingWindowSize = Math.min(getNumSampledBytesPerFile(), fileSize);
        // A file whose window is shorter than the file itself was only partially sampled.
        sampledEverything = fileSize == readingWindowSize && sampledEverything;
        OffsetRange range = new OffsetRange(0, readingWindowSize);
        TextSource textSource =
            new TextSource(
                ValueProvider.StaticValueProvider.of(file.getMetadata().resourceId().toString()),
                getEmptyMatchTreatment(),
                getDelimiters());
        FileBasedSource<String> source = CompressedSource.from(textSource).withCompression(file.getCompression());
        try (BoundedSource.BoundedReader<String> reader =
            source.createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo()).createReader(pipelineOptions)) {
            int numberOfNonEmptyLines = 0;
            for (boolean more = reader.start(); more; more = reader.advance()) {
                if (!reader.getCurrent().trim().isEmpty()) {
                    numberOfNonEmptyLines++;
                }
            }
            numberOfReadLines += numberOfNonEmptyLines;
            // Windows that produced only empty lines do not contribute to the average line size.
            if (numberOfNonEmptyLines != 0) {
                linesSize += readingWindowSize;
            }
        }
        // Track how much has been sampled so far. Previously this counter stayed at 0, so the
        // sampling strategy never saw any sampled bytes and a byte-based limit could not trigger.
        totalSampledBytes += readingWindowSize;
        // Empty files do not count towards the number of files read.
        numberOfReadFiles += fileSize == 0 ? 0 : 1;
        totalFileSizes += fileSize;
    }
    if (numberOfReadLines == 0 && sampledEverything) {
        return 0d;
    }
    if (numberOfReadLines == 0) {
        throw new NoEstimationException("Cannot estimate the row count. All the sampled lines are empty");
    }
    // This is total file sizes divided by average line size.
    return (double) totalFileSizes * numberOfReadLines / linesSize;
}
Also used : MatchResult(org.apache.beam.sdk.io.fs.MatchResult) OffsetRange(org.apache.beam.sdk.io.range.OffsetRange) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings)

Example 43 with OffsetRange

use of org.apache.beam.sdk.io.range.OffsetRange in project beam by apache.

the class GrowableOffsetRangeTracker method trySplit.

/**
 * Splits the tracked range. While the range is still open-ended (its end is
 * {@code Long.MAX_VALUE}), the split position is derived from the estimated range end; otherwise
 * the split is delegated to the superclass.
 *
 * @param fractionOfRemainder fraction of the remaining (estimated) work to keep in the primary.
 * @return the primary/residual pair, or {@code null} when no split is possible.
 */
@Override
public SplitResult<OffsetRange> trySplit(double fractionOfRemainder) {
    // Only an open-ended, non-empty range is treated as growable; anything else is split as a
    // normal range by the superclass.
    boolean growable = range.getTo() == Long.MAX_VALUE && range.getTo() != range.getFrom();
    if (!growable) {
        return super.trySplit(fractionOfRemainder);
    }
    // The last attempted offset is already Long.MAX_VALUE, so there is no room left to split.
    if (lastAttemptedOffset != null && lastAttemptedOffset == Long.MAX_VALUE) {
        return null;
    }

    final MathContext mc = MathContext.DECIMAL128;
    // All arithmetic is done in BigDecimal so values near the long limits do not overflow.
    final BigDecimal current;
    if (lastAttemptedOffset == null) {
        // Nothing attempted yet: start one position before the beginning of the range.
        current = BigDecimal.valueOf(range.getFrom()).subtract(BigDecimal.ONE, mc);
    } else {
        current = BigDecimal.valueOf(lastAttemptedOffset);
    }

    // Use the estimated end, but never less than the offset right after the current one.
    BigDecimal rawEstimate = BigDecimal.valueOf(rangeEndEstimator.estimate());
    BigDecimal estimatedEnd = rawEstimate.max(current.add(BigDecimal.ONE, mc));

    // splitOffset = current + max(1, (estimatedEnd - current) * fractionOfRemainder)
    BigDecimal remaining = estimatedEnd.subtract(current, mc);
    BigDecimal scaled = remaining.multiply(BigDecimal.valueOf(fractionOfRemainder), mc);
    BigDecimal step = scaled.max(BigDecimal.ONE);
    long splitOffset = current.add(step, mc).longValue();
    if (splitOffset > estimatedEnd.longValue()) {
        return null;
    }

    // Build the residual from the old end before shrinking the tracked range to the primary.
    OffsetRange residual = new OffsetRange(splitOffset, range.getTo());
    this.range = new OffsetRange(range.getFrom(), splitOffset);
    return SplitResult.of(range, residual);
}
Also used : OffsetRange(org.apache.beam.sdk.io.range.OffsetRange) BigDecimal(java.math.BigDecimal)

Example 44 with OffsetRange

use of org.apache.beam.sdk.io.range.OffsetRange in project beam by apache.

the class SplittableDoFnTest method testBoundedness.

/**
 * Checks the boundedness reported for the output of a splittable DoFn: a process method without a
 * return value yields a BOUNDED output, one returning a ProcessContinuation yields UNBOUNDED.
 */
@Test
@Category(NeedsRunner.class)
public void testBoundedness() {
    // TestPipeline.create() is used because the assertions are made without calling p.run().
    Pipeline p = TestPipeline.create();
    PCollection<String> foo = p.apply(Create.of("foo"));

    // Case 1: the process method returns nothing.
    DoFn<String, String> voidProcessFn = new DoFn<String, String>() {

        @ProcessElement
        public void process(RestrictionTracker<OffsetRange, Long> tracker) {
            // The body is irrelevant for this check.
        }

        @GetInitialRestriction
        public OffsetRange getInitialRestriction() {
            return new OffsetRange(0, 1);
        }
    };
    PCollection<String> boundedOutput = foo.apply(ParDo.of(voidProcessFn));
    assertEquals(PCollection.IsBounded.BOUNDED, boundedOutput.isBounded());

    // Case 2: the process method returns a ProcessContinuation.
    DoFn<String, String> continuationProcessFn = new DoFn<String, String>() {

        @ProcessElement
        public ProcessContinuation process(RestrictionTracker<OffsetRange, Long> tracker) {
            return stop();
        }

        @GetInitialRestriction
        public OffsetRange getInitialRestriction() {
            return new OffsetRange(0, 1);
        }
    };
    PCollection<String> unboundedOutput = foo.apply(ParDo.of(continuationProcessFn));
    assertEquals(PCollection.IsBounded.UNBOUNDED, unboundedOutput.isBounded());
}
Also used : OffsetRange(org.apache.beam.sdk.io.range.OffsetRange) PCollection(org.apache.beam.sdk.values.PCollection) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 45 with OffsetRange

use of org.apache.beam.sdk.io.range.OffsetRange in project beam by apache.

the class GrowableOffsetRangeTrackerTest method testCheckpointJustStarted.

/**
 * Checkpoints (splits with fraction 0) a growable tracker right after a single claim and verifies
 * that the split lands directly after the claimed offset, for two different range-end estimates.
 */
@Test
public void testCheckpointJustStarted() throws Exception {
    SimpleEstimator simpleEstimator = new SimpleEstimator();
    GrowableOffsetRangeTracker tracker = new GrowableOffsetRangeTracker(0L, simpleEstimator);
    assertTrue(tracker.tryClaim(5L));
    // Estimated end (0) lies before the claimed offset (5).
    simpleEstimator.setEstimateRangeEnd(0L);
    // Parameterized rather than raw, so the primary/residual accessors keep their OffsetRange type.
    SplitResult<OffsetRange> res = tracker.trySplit(0);
    tracker.checkDone();
    assertEquals(new OffsetRange(0, 6), res.getPrimary());
    assertEquals(new OffsetRange(0, 6), tracker.currentRestriction());
    assertEquals(new OffsetRange(6, Long.MAX_VALUE), res.getResidual());

    // Same scenario with a fresh tracker, now with an estimated end (20) beyond the claimed offset.
    tracker = new GrowableOffsetRangeTracker(0L, simpleEstimator);
    assertTrue(tracker.tryClaim(5L));
    simpleEstimator.setEstimateRangeEnd(20L);
    res = tracker.trySplit(0);
    tracker.checkDone();
    assertEquals(new OffsetRange(0, 6), res.getPrimary());
    assertEquals(new OffsetRange(6, Long.MAX_VALUE), res.getResidual());
}
Also used : OffsetRange(org.apache.beam.sdk.io.range.OffsetRange) Test(org.junit.Test)

Aggregations

OffsetRange (org.apache.beam.sdk.io.range.OffsetRange)63 Test (org.junit.Test)53 Instant (org.joda.time.Instant)8 ArrayList (java.util.ArrayList)5 OffsetRangeTracker (org.apache.beam.sdk.transforms.splittabledofn.OffsetRangeTracker)5 Progress (org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker.Progress)5 ProcessContinuation (org.apache.beam.sdk.transforms.DoFn.ProcessContinuation)4 PartitionMetadata (org.apache.beam.sdk.io.gcp.spanner.changestreams.model.PartitionMetadata)3 DoFn (org.apache.beam.sdk.transforms.DoFn)3 BigDecimal (java.math.BigDecimal)2 RestrictionTracker (org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker)2 Offset (com.google.cloud.pubsublite.Offset)1 SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings)1 Scope (io.opencensus.common.Scope)1 AttributeValue (io.opencensus.trace.AttributeValue)1 Tracer (io.opencensus.trace.Tracer)1 Tracing (io.opencensus.trace.Tracing)1 Serializable (java.io.Serializable)1 Map (java.util.Map)1 Optional (java.util.Optional)1