Search in sources:

Example 1 with ArrowReader

Use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.

In class ParallelArrowReader, method close.

/**
 * Stops background work and releases the resources held by this reader.
 *
 * <p>Steps, in order: mark the root tracer finished; ask the reader thread to stop and wait up
 * to 10 seconds for it; shut the executor down and wait up to 10 seconds for termination; close
 * any {@link ArrowRecordBatch} still sitting in the queue; mark every per-reader tracer
 * finished; close each delegate {@link ArrowReader} without closing its underlying channel.
 *
 * <p>If the calling thread is interrupted during either wait, the remaining cleanup steps are
 * still performed and the thread's interrupt status is restored before this method returns, so
 * callers can still observe the interruption.
 */
@Override
public void close() {
    rootTracer.finished();
    // Remembered rather than re-asserted immediately: setting the flag right away would make the
    // later awaitTermination() call fail instantly instead of waiting for the executor.
    boolean interrupted = false;
    // Try to force reader thread to stop.
    if (readerThread != null) {
        // A non-positive count ends the scheduling loop in consumeReaders().
        readersReady.set(0);
        readerThread.interrupt();
        try {
            readerThread.join(10000);
        } catch (InterruptedException e) {
            interrupted = true;
            log.info("Interrupted while waiting for reader thread to finish.");
        }
        if (readerThread.isAlive()) {
            log.warn("Reader thread did not shutdown in 10 seconds.");
        } else {
            log.info("Reader thread stopped.  Queue size: {}", queue.size());
        }
    }
    // Stop any queued tasks from processing.
    executor.shutdownNow();
    try {
        if (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
            log.warn("executor did not terminate after 10 seconds");
        }
    } catch (InterruptedException e) {
        interrupted = true;
        log.info("Interrupted when awaiting executor termination");
    }
    // The queue can also hold non-batch elements (e.g. throwables); only batches need closing.
    queue.stream()
        .filter(x -> x instanceof ArrowRecordBatch)
        .map(x -> (ArrowRecordBatch) x)
        .forEach(ArrowRecordBatch::close);
    for (BigQueryStorageReadRowsTracer tracer : tracers) {
        tracer.finished();
    }
    for (ArrowReader reader : readers) {
        try {
            // Don't close the stream here because it will consume all of it.
            // We let other components worry about stream closure.
            reader.close(/*close underlying channel*/
            false);
        } catch (Exception e) {
            // Best effort: one failing delegate must not prevent closing the others.
            log.info("Trouble closing delegate readers", e);
        }
    }
    if (interrupted) {
        // Hand the interruption back to the caller now that cleanup is complete.
        Thread.currentThread().interrupt();
    }
}
Also used : VectorLoader(org.apache.arrow.vector.VectorLoader) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) Logger(org.slf4j.Logger) Semaphore(java.util.concurrent.Semaphore) VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) LoggerFactory(org.slf4j.LoggerFactory) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) IOException(java.io.IOException) BlockingQueue(java.util.concurrent.BlockingQueue) TimeUnit(java.util.concurrent.TimeUnit) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) ArrowRecordBatch(org.apache.arrow.vector.ipc.message.ArrowRecordBatch) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) VectorUnloader(org.apache.arrow.vector.VectorUnloader) Preconditions(com.google.common.base.Preconditions) ExecutorService(java.util.concurrent.ExecutorService) ArrowRecordBatch(org.apache.arrow.vector.ipc.message.ArrowRecordBatch) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) IOException(java.io.IOException)

Example 2 with ArrowReader

Use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.

In class ParallelArrowReader, method consumeReaders.

/**
 * Scheduling loop that keeps submitting "load next batch" tasks to {@code executor}, one per
 * reader that still has data, and publishes the results on {@code queue}.
 *
 * <p>Each task loads the next batch from its reader, unloads it into an
 * {@link ArrowRecordBatch} and offers that batch to the queue. A throwable raised while loading
 * is offered to the queue instead. When the loop finishes without the scheduling code itself
 * throwing, {@code DONE_SENTINEL} is offered; if the scheduling code throws (including an
 * {@link InterruptedException} from the semaphore), the throwable is offered and no sentinel is
 * added.
 *
 * <p>NOTE(review): presumably this runs on {@code readerThread} and the permits taken from
 * {@code queueSemaphore} are returned by whoever consumes the queue - neither is visible in
 * this method; confirm against the rest of the class.
 */
private void consumeReaders() {
    try {
        // Tracks which readers have exhausted all of their elements
        AtomicBoolean[] hasData = new AtomicBoolean[readers.size()];
        // Last value of reader.bytesRead() seen per reader, used to report incremental bytes.
        long[] lastBytesRead = new long[readers.size()];
        VectorUnloader[] unloader = new VectorUnloader[readers.size()];
        VectorSchemaRoot[] roots = new VectorSchemaRoot[readers.size()];
        // Per-reader setup: every reader starts out as "has data" with zero bytes read.
        for (int x = 0; x < hasData.length; x++) {
            hasData[x] = new AtomicBoolean();
            hasData[x].set(true);
            lastBytesRead[x] = 0;
            roots[x] = readers.get(x).getVectorSchemaRoot();
            unloader[x] = new VectorUnloader(roots[x], /*includeNullCount=*/
            true, /*alignBuffers=*/
            false);
            tracers[x].startStream();
        }
        // Keep cycling over the readers until the counter drops to zero or below; it is
        // decremented per finished reader below and forced to 0 on failure.
        while (readersReady.get() > 0) {
            for (int readerIdx = 0; readerIdx < readers.size(); readerIdx++) {
                // Skip readers already known to be finished so that no further tasks are
                // scheduled for them.
                if (!hasData[readerIdx].get()) {
                    continue;
                }
                ArrowReader reader = readers.get(readerIdx);
                final int idx = readerIdx;
                // Take a permit before scheduling; blocks while none is available. An
                // InterruptedException thrown here is handled by the outer catch.
                queueSemaphore.acquire();
                executor.submit(() -> {
                    // Tasks locking the same root object run one at a time.
                    synchronized (roots[idx]) {
                        // Re-check under the lock: a task submitted earlier for this reader may
                        // have cleared the flag in the meantime.
                        // NOTE(review): this path returns without releasing the permit acquired
                        // before submit - confirm that is intended.
                        if (!hasData[idx].get()) {
                            return;
                        }
                        try {
                            tracers[idx].readRowsResponseRequested();
                            hasData[idx].set(reader.loadNextBatch());
                            if (!hasData[idx].get()) {
                                // Nothing will be enqueued for this task, so return the permit.
                                queueSemaphore.release();
                            }
                            long incrementalBytesRead = reader.bytesRead() - lastBytesRead[idx];
                            tracers[idx].readRowsResponseObtained(/*bytesReceived=*/
                            incrementalBytesRead);
                            lastBytesRead[idx] = reader.bytesRead();
                        } catch (Throwable e) {
                            // Mark the reader finished, stop the scheduling loop and surface the
                            // failure through the queue.
                            log.info("Exception caught while consuming reader.", e);
                            hasData[idx].set(false);
                            readersReady.set(0);
                            Preconditions.checkState(queue.offer(e), "Expected space in queue");
                        }
                        ArrowRecordBatch batch = null;
                        // Reached both when the reader is exhausted and after a failure above.
                        // NOTE(review): after a failure readersReady was just set to 0, so this
                        // decrement can take it negative; the "> 0" loop condition still ends.
                        if (!hasData[idx].get()) {
                            readersReady.addAndGet(-1);
                            return;
                        }
                        int rows = 0;
                        try {
                            rows = reader.getVectorSchemaRoot().getRowCount();
                        } catch (IOException e) {
                            // The failure is forwarded to the queue, but execution continues
                            // below with rows left at 0.
                            queue.offer(e);
                        }
                        // Not quite parsing but re-use it here.
                        tracers[idx].rowsParseStarted();
                        batch = unloader[idx].getRecordBatch();
                        tracers[idx].rowsParseFinished(rows);
                        try {
                            Preconditions.checkState(queue.offer(batch), "Expected space in queue");
                        } catch (Exception e) {
                            // The batch never reached the queue, so close it here before
                            // rethrowing.
                            batch.close();
                            throw e;
                        }
                    }
                });
            }
        }
    } catch (Throwable e) {
        // Failure in the scheduling code itself: publish it and skip the DONE sentinel.
        log.info("Read ahead caught exceptions", e);
        Preconditions.checkState(queue.offer(e), "Expected available capacity");
        return;
    }
    // Normal completion: tell the consumer that no more elements will arrive.
    Preconditions.checkState(queue.offer(DONE_SENTINEL), "Expected available capacity");
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) IOException(java.io.IOException) IOException(java.io.IOException) VectorUnloader(org.apache.arrow.vector.VectorUnloader) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ArrowRecordBatch(org.apache.arrow.vector.ipc.message.ArrowRecordBatch) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader)

Example 3 with ArrowReader

Use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.

In class ParallelArrowReaderTest, method testExceptionIsPropagatedFromNext.

// @Test
// public void testReadsAllBatchesInRoundRobin() throws Exception {
// ArrowReader r1 = getReaderWithSequence(0);
// ArrowReader r2 = getReaderWithSequence(1, 3);
// ArrowReader r3 = getReaderWithSequence(2, 4, 5);
// ExecutorService executor = Executors.newFixedThreadPool(3);
// List<Integer> read = new ArrayList<>();
// try (VectorSchemaRoot root =
// VectorSchemaRoot.create(r1.getVectorSchemaRoot().getSchema(), allocator)) {
// VectorLoader loader = new VectorLoader(root);
// ParallelArrowReader reader =
// new ParallelArrowReader(
// ImmutableList.of(r1, r2, r3),
// executor,
// loader,
// new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
// 
// while (reader.next()) {
// read.add(((IntVector) root.getVector(0)).get(0));
// }
// reader.close();
// }
// 
// assertThat(read).containsExactlyElementsIn(ImmutableList.of(0, 1, 2, 3, 4, 5)).inOrder();
// assertThat(executor.isShutdown()).isTrue();
// }
// @Test
// public void testReadsAllBatchesInRoundRobinOneelement() throws Exception {
// 
// ColumnarBatch[] batches = new ColumnarBatch[6];
// for (int x = 0; x < batches.length; x++) {
// batches[x] = new ColumnarBatch(new ColumnVector[0]);
// }
// ArrowReader r1 = getReaderWithSequence();
// ArrowReader r2 = getReaderWithSequence(0);
// ArrowReader r3 = getReaderWithSequence();
// ExecutorService executor = Executors.newFixedThreadPool(3);
// List<Integer> read = new ArrayList<>();
// try (VectorSchemaRoot root =
// VectorSchemaRoot.create(r1.getVectorSchemaRoot().getSchema(), allocator)) {
// VectorLoader loader = new VectorLoader(root);
// ParallelArrowReader reader =
// new ParallelArrowReader(
// ImmutableList.of(r1, r2, r3),
// executor,
// loader,
// new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
// 
// while (reader.next()) {
// read.add(((IntVector) root.getVector(0)).get(0));
// }
// reader.close();
// }
// 
// assertThat(read).containsExactlyElementsIn(ImmutableList.of(0)).inOrder();
// assertThat(executor.isShutdown()).isTrue();
// }
/**
 * A failure raised by a delegate's {@code loadNextBatch()} must reach the caller of
 * {@code ParallelArrowReader.next()} as the very same exception instance.
 */
@Test
public void testExceptionIsPropagatedFromNext() throws Exception {
    // Delegate that fails as soon as it is asked for a batch.
    IOException expectedFailure = new IOException("an exception");
    ArrowReader failingDelegate = mock(ArrowReader.class);
    when(failingDelegate.loadNextBatch()).thenThrow(expectedFailure);

    ExecutorService directExecutor = MoreExecutors.newDirectExecutorService();
    try (VectorSchemaRoot targetRoot = new VectorSchemaRoot(ImmutableList.of());
        VectorSchemaRoot delegateRoot = new VectorSchemaRoot(ImmutableList.of())) {
        when(failingDelegate.getVectorSchemaRoot()).thenReturn(delegateRoot);

        ImmutableList<ArrowReader> delegates = ImmutableList.of(failingDelegate);
        VectorLoader targetLoader = new VectorLoader(targetRoot);
        LoggingBigQueryStorageReadRowsTracer tracer =
            new LoggingBigQueryStorageReadRowsTracer("stream_name", 2);
        ParallelArrowReader parallelReader =
            new ParallelArrowReader(delegates, directExecutor, targetLoader, tracer);

        IOException thrown =
            Assert.assertThrows(IOException.class, () -> parallelReader.next());
        // Same instance, not merely an equal or wrapped exception.
        assertThat(thrown).isSameInstanceAs(expectedFailure);
    }
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) VectorLoader(org.apache.arrow.vector.VectorLoader) ExecutorService(java.util.concurrent.ExecutorService) IOException(java.io.IOException) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) Test(org.junit.Test)

Example 4 with ArrowReader

Use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.

In class ParallelArrowReaderTest, method testInterruptsOnClose.

/**
 * Closing the reader while a consumer is in {@code next()} must unblock that consumer promptly:
 * the consumer is expected to finish in under 100 ms, measured from just before it was started.
 */
@Test
public void testInterruptsOnClose() throws Exception {
    try (VectorSchemaRoot sharedRoot = VectorSchemaRoot.of()) {
        // Counted down once the slow delegate has been asked for a batch.
        CountDownLatch slowLoadStarted = new CountDownLatch(1);

        // First delegate: always reports that another batch is available.
        ArrowReader fastDelegate = mock(ArrowReader.class);
        when(fastDelegate.loadNextBatch()).thenReturn(true);
        when(fastDelegate.getVectorSchemaRoot()).thenReturn(sharedRoot);

        // Second delegate: announces itself via the latch, then takes 100 ms per batch.
        ArrowReader slowDelegate = mock(ArrowReader.class);
        when(slowDelegate.loadNextBatch()).thenAnswer((InvocationOnMock ignored) -> {
            slowLoadStarted.countDown();
            MILLISECONDS.sleep(100);
            return true;
        });
        when(slowDelegate.getVectorSchemaRoot()).thenReturn(sharedRoot);

        VectorLoader mockLoader = mock(VectorLoader.class);
        ExecutorService readerExecutor = Executors.newSingleThreadExecutor();
        ParallelArrowReader parallelReader =
            new ParallelArrowReader(
                ImmutableList.of(fastDelegate, slowDelegate),
                readerExecutor,
                mockLoader,
                new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));

        ExecutorService consumerExecutor = Executors.newSingleThreadExecutor();
        Instant startedAt = Instant.now();
        Future<Instant> finishedAtFuture = consumerExecutor.submit(() -> {
            try {
                while (parallelReader.next()) {
                    // Drain until the reader reports no more data or throws.
                }
            } catch (Exception e) {
                // Only a failure caused by interruption counts as success; anything else
                // yields the epoch so the assertions below fail.
                boolean causedByInterrupt = e.getCause() instanceof InterruptedException;
                if (!causedByInterrupt) {
                    return Instant.ofEpochMilli(0);
                }
            }
            return Instant.now();
        });

        // Wait until next gets called.
        slowLoadStarted.await();
        // Should interrupt blocking operations.
        consumerExecutor.shutdownNow();
        parallelReader.close();

        Instant finishedAt = finishedAtFuture.get();
        assertThat(finishedAt).isGreaterThan(startedAt);
        assertThat(Duration.between(startedAt, finishedAt)).isLessThan(Duration.ofMillis(100));
    }
}
Also used : VectorSchemaRoot(org.apache.arrow.vector.VectorSchemaRoot) VectorLoader(org.apache.arrow.vector.VectorLoader) InvocationOnMock(org.mockito.invocation.InvocationOnMock) Instant(java.time.Instant) ExecutorService(java.util.concurrent.ExecutorService) CountDownLatch(java.util.concurrent.CountDownLatch) ArrowReader(org.apache.arrow.vector.ipc.ArrowReader) IOException(java.io.IOException) Test(org.junit.Test)

Aggregations

IOException (java.io.IOException)4 VectorSchemaRoot (org.apache.arrow.vector.VectorSchemaRoot)4 ArrowReader (org.apache.arrow.vector.ipc.ArrowReader)4 ExecutorService (java.util.concurrent.ExecutorService)3 VectorLoader (org.apache.arrow.vector.VectorLoader)3 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)2 VectorUnloader (org.apache.arrow.vector.VectorUnloader)2 ArrowRecordBatch (org.apache.arrow.vector.ipc.message.ArrowRecordBatch)2 Test (org.junit.Test)2 Preconditions (com.google.common.base.Preconditions)1 Instant (java.time.Instant)1 List (java.util.List)1 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)1 BlockingQueue (java.util.concurrent.BlockingQueue)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 Semaphore (java.util.concurrent.Semaphore)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 InvocationOnMock (org.mockito.invocation.InvocationOnMock)1 Logger (org.slf4j.Logger)1