use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.
the class ParallelArrowReader method close.
@Override
public void close() {
  rootTracer.finished();
  // Try to force reader thread to stop.
  if (readerThread != null) {
    readersReady.set(0);
    readerThread.interrupt();
    try {
      readerThread.join(10000);
    } catch (InterruptedException e) {
      log.info("Interrupted while waiting for reader thread to finish.");
    }
    if (readerThread.isAlive()) {
      log.warn("Reader thread did not shutdown in 10 seconds.");
    } else {
      log.info("Reader thread stopped. Queue size: {}", queue.size());
    }
  }
  // Stop any queued tasks from processing.
  executor.shutdownNow();
  try {
    if (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
      log.warn("executor did not terminate after 10 seconds");
    }
  } catch (InterruptedException e) {
    log.info("Interrupted when awaiting executor termination");
  }
  queue.stream()
      .filter(x -> x instanceof ArrowRecordBatch)
      .map(x -> (ArrowRecordBatch) x)
      .forEach(ArrowRecordBatch::close);
  for (BigQueryStorageReadRowsTracer tracer : tracers) {
    tracer.finished();
  }
  for (ArrowReader reader : readers) {
    try {
      // Don't close the stream here because it will consume all of it.
      // We let other components worry about stream closure.
      reader.close(/*close underlying channel*/ false);
    } catch (Exception e) {
      log.info("Trouble closing delegate readers", e);
    }
  }
}
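The shutdown above follows a common two-stage pattern: interrupt the background reader thread and give it a bounded join, then shutdownNow() the executor and wait briefly for termination, and only then release any Arrow buffers still sitting in the queue. Below is a minimal, self-contained sketch of that same pattern; BackgroundWorkerShutdownSketch and its fields are illustrative names, not part of the connector.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Illustrative only: a background worker plus an executor, shut down the same way
// ParallelArrowReader.close() does it (interrupt, bounded join, shutdownNow).
class BackgroundWorkerShutdownSketch {
  private final Thread workerThread = new Thread(() -> { /* produce work */ });
  private final ExecutorService executor = Executors.newFixedThreadPool(2);

  void close() {
    // Ask the worker to stop and give it a bounded time to exit.
    workerThread.interrupt();
    try {
      workerThread.join(10_000);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt(); // preserve the interrupt status
    }
    // Cancel queued tasks and wait briefly for running ones to finish.
    executor.shutdownNow();
    try {
      if (!executor.awaitTermination(10, TimeUnit.SECONDS)) {
        System.err.println("executor did not terminate after 10 seconds");
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    }
  }
}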
use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.
the class ParallelArrowReader method consumeReaders.
private void consumeReaders() {
  try {
    // Tracks which readers have exhausted all of their elements.
    AtomicBoolean[] hasData = new AtomicBoolean[readers.size()];
    long[] lastBytesRead = new long[readers.size()];
    VectorUnloader[] unloader = new VectorUnloader[readers.size()];
    VectorSchemaRoot[] roots = new VectorSchemaRoot[readers.size()];
    for (int x = 0; x < hasData.length; x++) {
      hasData[x] = new AtomicBoolean();
      hasData[x].set(true);
      lastBytesRead[x] = 0;
      roots[x] = readers.get(x).getVectorSchemaRoot();
      unloader[x] = new VectorUnloader(roots[x], /*includeNullCount=*/ true, /*alignBuffers=*/ false);
      tracers[x].startStream();
    }
    while (readersReady.get() > 0) {
      for (int readerIdx = 0; readerIdx < readers.size(); readerIdx++) {
        // Skip readers that are already exhausted; don't submit more tasks for them.
        if (!hasData[readerIdx].get()) {
          continue;
        }
        ArrowReader reader = readers.get(readerIdx);
        final int idx = readerIdx;
        queueSemaphore.acquire();
        executor.submit(() -> {
          synchronized (roots[idx]) {
            if (!hasData[idx].get()) {
              return;
            }
            try {
              tracers[idx].readRowsResponseRequested();
              hasData[idx].set(reader.loadNextBatch());
              if (!hasData[idx].get()) {
                queueSemaphore.release();
              }
              long incrementalBytesRead = reader.bytesRead() - lastBytesRead[idx];
              tracers[idx].readRowsResponseObtained(/*bytesReceived=*/ incrementalBytesRead);
              lastBytesRead[idx] = reader.bytesRead();
            } catch (Throwable e) {
              log.info("Exception caught while consuming reader.", e);
              hasData[idx].set(false);
              readersReady.set(0);
              Preconditions.checkState(queue.offer(e), "Expected space in queue");
            }
            ArrowRecordBatch batch = null;
            if (!hasData[idx].get()) {
              readersReady.addAndGet(-1);
              return;
            }
            int rows = 0;
            try {
              rows = reader.getVectorSchemaRoot().getRowCount();
            } catch (IOException e) {
              queue.offer(e);
            }
            // Not quite parsing, but re-use the parse tracer here.
            tracers[idx].rowsParseStarted();
            batch = unloader[idx].getRecordBatch();
            tracers[idx].rowsParseFinished(rows);
            try {
              Preconditions.checkState(queue.offer(batch), "Expected space in queue");
            } catch (Exception e) {
              batch.close();
              throw e;
            }
          }
        });
      }
    }
  } catch (Throwable e) {
    log.info("Read ahead caught exceptions", e);
    Preconditions.checkState(queue.offer(e), "Expected available capacity");
    return;
  }
  Preconditions.checkState(queue.offer(DONE_SENTINEL), "Expected available capacity");
}
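consumeReaders() is, at its core, a bounded producer: a Semaphore caps how many batches may be in flight, each worker task publishes either a batch or the Throwable it hit, and a DONE_SENTINEL marks the end of the stream. The following stripped-down sketch shows just that shape with plain Object payloads; BoundedQueueSketch and its members are hypothetical names and the queue bound of 4 is arbitrary.

import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Queue;
import java.util.concurrent.Semaphore;

// Illustrative producer/consumer skeleton: a semaphore bounds how many items the
// producers may enqueue; a sentinel object tells the consumer the stream is done.
class BoundedQueueSketch {
  static final Object DONE_SENTINEL = new Object();

  final Queue<Object> queue = new ConcurrentLinkedQueue<>();
  final Semaphore queueSemaphore = new Semaphore(4); // at most 4 in-flight items
  final ExecutorService executor = Executors.newFixedThreadPool(2);

  void produce(int items) throws InterruptedException {
    for (int i = 0; i < items; i++) {
      final int item = i;
      queueSemaphore.acquire(); // block until the consumer frees a slot
      executor.submit(() -> {
        try {
          queue.offer(item); // publish the result
        } catch (Throwable t) {
          queue.offer(t); // on failure, publish the error so the consumer can rethrow it
        }
      });
    }
    queue.offer(DONE_SENTINEL);
  }

  // The consumer releases a permit for every real item it takes off the queue.
  Object consumeOne() {
    Object next = queue.poll();
    if (next != null && next != DONE_SENTINEL) {
      queueSemaphore.release();
    }
    return next;
  }
}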
use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.
the class ParallelArrowReaderTest method testExceptionIsPropagatedFromNext.
// @Test
// public void testReadsAllBatchesInRoundRobin() throws Exception {
// ArrowReader r1 = getReaderWithSequence(0);
// ArrowReader r2 = getReaderWithSequence(1, 3);
// ArrowReader r3 = getReaderWithSequence(2, 4, 5);
// ExecutorService executor = Executors.newFixedThreadPool(3);
// List<Integer> read = new ArrayList<>();
// try (VectorSchemaRoot root =
// VectorSchemaRoot.create(r1.getVectorSchemaRoot().getSchema(), allocator)) {
// VectorLoader loader = new VectorLoader(root);
// ParallelArrowReader reader =
// new ParallelArrowReader(
// ImmutableList.of(r1, r2, r3),
// executor,
// loader,
// new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
//
// while (reader.next()) {
// read.add(((IntVector) root.getVector(0)).get(0));
// }
// reader.close();
// }
//
// assertThat(read).containsExactlyElementsIn(ImmutableList.of(0, 1, 2, 3, 4, 5)).inOrder();
// assertThat(executor.isShutdown()).isTrue();
// }
// @Test
// public void testReadsAllBatchesInRoundRobinOneelement() throws Exception {
//
// ColumnarBatch[] batches = new ColumnarBatch[6];
// for (int x = 0; x < batches.length; x++) {
// batches[x] = new ColumnarBatch(new ColumnVector[0]);
// }
// ArrowReader r1 = getReaderWithSequence();
// ArrowReader r2 = getReaderWithSequence(0);
// ArrowReader r3 = getReaderWithSequence();
// ExecutorService executor = Executors.newFixedThreadPool(3);
// List<Integer> read = new ArrayList<>();
// try (VectorSchemaRoot root =
// VectorSchemaRoot.create(r1.getVectorSchemaRoot().getSchema(), allocator)) {
// VectorLoader loader = new VectorLoader(root);
// ParallelArrowReader reader =
// new ParallelArrowReader(
// ImmutableList.of(r1, r2, r3),
// executor,
// loader,
// new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
//
// while (reader.next()) {
// read.add(((IntVector) root.getVector(0)).get(0));
// }
// reader.close();
// }
//
// assertThat(read).containsExactlyElementsIn(ImmutableList.of(0)).inOrder();
// assertThat(executor.isShutdown()).isTrue();
// }
@Test
public void testExceptionIsPropagatedFromNext() throws Exception {
  IOException exception = new IOException("an exception");
  ArrowReader r1 = mock(ArrowReader.class);
  when(r1.loadNextBatch()).thenThrow(exception);
  ExecutorService executor = MoreExecutors.newDirectExecutorService();
  try (VectorSchemaRoot root = new VectorSchemaRoot(ImmutableList.of());
      VectorSchemaRoot root2 = new VectorSchemaRoot(ImmutableList.of())) {
    when(r1.getVectorSchemaRoot()).thenReturn(root2);
    ParallelArrowReader reader =
        new ParallelArrowReader(
            ImmutableList.of(r1),
            executor,
            new VectorLoader(root),
            new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
    IOException e = Assert.assertThrows(IOException.class, reader::next);
    assertThat(e).isSameInstanceAs(exception);
  }
}
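The test expects next() to surface the exact IOException instance that the background task placed on the queue. Below is a hedged sketch of how such a hand-off is commonly written; it is an assumption about the shape of the logic, not the connector's actual next() implementation, and QueuedExceptionSketch is a hypothetical name.

import java.io.IOException;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Illustrative only: rethrow a Throwable that a worker queued, preserving the
// original instance when it is already an IOException (so isSameInstanceAs holds).
class QueuedExceptionSketch {
  final BlockingQueue<Object> queue = new LinkedBlockingQueue<>();

  boolean next() throws IOException {
    Object element;
    try {
      element = queue.take();
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new IOException("Interrupted while waiting for the next batch", e);
    }
    if (element instanceof IOException) {
      throw (IOException) element; // propagate the worker's exception as-is
    }
    if (element instanceof Throwable) {
      throw new IOException((Throwable) element);
    }
    return true; // element is a data batch in the real reader
  }
}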
use of org.apache.arrow.vector.ipc.ArrowReader in project spark-bigquery-connector by GoogleCloudDataproc.
the class ParallelArrowReaderTest method testInterruptsOnClose.
@Test
public void testInterruptsOnClose() throws Exception {
  try (VectorSchemaRoot root = VectorSchemaRoot.of()) {
    ArrowReader r1 = mock(ArrowReader.class);
    when(r1.loadNextBatch()).thenReturn(true);
    when(r1.getVectorSchemaRoot()).thenReturn(root);
    CountDownLatch latch = new CountDownLatch(1);
    ArrowReader r2 = mock(ArrowReader.class);
    when(r2.loadNextBatch())
        .thenAnswer(
            (InvocationOnMock invocation) -> {
              latch.countDown();
              MILLISECONDS.sleep(100);
              return true;
            });
    when(r2.getVectorSchemaRoot()).thenReturn(root);
    VectorLoader loader = mock(VectorLoader.class);
    ExecutorService executor = Executors.newSingleThreadExecutor();
    ParallelArrowReader reader =
        new ParallelArrowReader(
            ImmutableList.of(r1, r2),
            executor,
            loader,
            new LoggingBigQueryStorageReadRowsTracer("stream_name", 2));
    ExecutorService oneOff = Executors.newSingleThreadExecutor();
    Instant start = Instant.now();
    Future<Instant> endTime =
        oneOff.submit(
            () -> {
              try {
                while (reader.next()) {
                }
              } catch (Exception e) {
                if (e.getCause() == null || !(e.getCause() instanceof InterruptedException)) {
                  return Instant.ofEpochMilli(0);
                }
              }
              return Instant.now();
            });
    // Wait until next gets called.
    latch.await();
    // Should interrupt blocking operations.
    oneOff.shutdownNow();
    reader.close();
    assertThat(endTime.get()).isGreaterThan(start);
    assertThat(Duration.between(start, endTime.get())).isLessThan(Duration.ofMillis(100));
  }
}
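The test hinges on ExecutorService.shutdownNow() delivering an interrupt to the task blocked inside reader.next(), so the read loop exits well before the 100 ms sleep completes. A tiny standalone demonstration of that mechanism, with hypothetical names:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

// Illustrative only: shutdownNow() interrupts a task that is blocked, which is the
// mechanism testInterruptsOnClose relies on to finish before the 100 ms sleep ends.
public class ShutdownNowInterruptsSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService oneOff = Executors.newSingleThreadExecutor();
    CountDownLatch started = new CountDownLatch(1);
    Future<String> result = oneOff.submit(() -> {
      started.countDown();
      try {
        TimeUnit.MILLISECONDS.sleep(100); // stands in for a blocking reader.next()
        return "finished normally";
      } catch (InterruptedException e) {
        return "interrupted";
      }
    });
    started.await();      // make sure the task is actually running
    oneOff.shutdownNow(); // delivers the interrupt to the sleeping task
    System.out.println(result.get()); // prints "interrupted" almost immediately
  }
}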