use of org.apache.druid.java.util.common.parsers.CloseableIterator in project druid by druid-io.
the class TimedShutoffInputSourceReader method decorateShutdownTimeout.
private <T> CloseableIterator<T> decorateShutdownTimeout(ScheduledExecutorService exec, CloseableIterator<T> delegateIterator) {
final Closer closer = Closer.create();
closer.register(delegateIterator);
closer.register(exec::shutdownNow);
final CloseableIterator<T> wrappingIterator = new CloseableIterator<T>() {
/**
* Indicates whether this iterator has been closed.
* Volatile so that the write in {@link #close()} happens-before the read in {@link #hasNext()} when they run on different threads.
*/
volatile boolean closed;
/**
* Caches the next item. The item returned from the underlying iterator is either a non-null {@link InputRow}
* or {@link InputRowListPlusRawValues}.
* Not volatile since {@link #hasNext()} and {@link #next()} are supposed to be called by the same thread.
*/
T next = null;
@Override
public boolean hasNext() {
if (next != null) {
return true;
}
if (!closed && delegateIterator.hasNext()) {
next = delegateIterator.next();
return true;
} else {
return false;
}
}
@Override
public T next() {
if (next != null) {
final T returnValue = next;
next = null;
return returnValue;
} else {
throw new NoSuchElementException();
}
}
@Override
public void close() throws IOException {
closed = true;
closer.close();
}
};
exec.schedule(() -> {
LOG.info("Closing delegate inputSource.");
try {
wrappingIterator.close();
} catch (IOException e) {
LOG.warn(e, "Failed to close delegate inputSource, ignoring.");
}
}, shutoffTime.getMillis() - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
return wrappingIterator;
}
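The wrapper above leans on Druid utilities (Closer, LOG, shutoffTime). Below is a minimal, JDK-only sketch of the same timed-shutoff decoration; SimpleCloseableIterator, withDeadline, and deadlineMillis are illustrative names rather than Druid APIs, and the Closer is replaced by closing the delegate and shutting down the executor directly in close().
import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Stand-in for Druid's CloseableIterator, which extends Iterator<T> and Closeable.
interface SimpleCloseableIterator<T> extends Iterator<T>, Closeable {}

class TimedShutoffSketch {
  // Wraps the delegate so it is forcibly closed once the deadline (epoch millis) passes.
  static <T> SimpleCloseableIterator<T> withDeadline(SimpleCloseableIterator<T> delegate, long deadlineMillis) {
    final ScheduledExecutorService exec = Executors.newSingleThreadScheduledExecutor();
    final SimpleCloseableIterator<T> wrapper = new SimpleCloseableIterator<T>() {
      volatile boolean closed;
      T next;

      @Override
      public boolean hasNext() {
        if (next != null) {
          return true;
        }
        if (!closed && delegate.hasNext()) {
          next = delegate.next();
          return true;
        }
        return false;
      }

      @Override
      public T next() {
        if (next == null) {
          throw new NoSuchElementException();
        }
        final T retVal = next;
        next = null;
        return retVal;
      }

      @Override
      public void close() throws IOException {
        closed = true;
        try {
          delegate.close();
        } finally {
          exec.shutdownNow();
        }
      }
    };
    // Force-close at the deadline; a deadline already in the past fires immediately.
    exec.schedule(() -> {
      try {
        wrapper.close();
      } catch (IOException ignored) {
        // best-effort close on timeout
      }
    }, deadlineMillis - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
    return wrapper;
  }
}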
use of org.apache.druid.java.util.common.parsers.CloseableIterator in project druid by druid-io.
the class IntermediateRowParsingReader method read.
@Override
public CloseableIterator<InputRow> read() throws IOException {
final CloseableIteratorWithMetadata<T> intermediateRowIteratorWithMetadata = intermediateRowIteratorWithMetadata();
return new CloseableIterator<InputRow>() {
// Since parseInputRows() returns a list, the code below always iterates over that list, which means it calls
// Iterator.hasNext() and Iterator.next() at least once per row. This is unnecessary work when a row is not
// exploded into multiple InputRows. If this turns out to be a performance bottleneck, the parseInputRows()
// interface may need to be reconsidered; subclasses could implement read() with some duplicated code to avoid
// the extra iteration over a singleton list.
Iterator<InputRow> rows = null;
long currentRecordNumber = 1;
@Override
public boolean hasNext() {
if (rows == null || !rows.hasNext()) {
if (!intermediateRowIteratorWithMetadata.hasNext()) {
return false;
}
final T row = intermediateRowIteratorWithMetadata.next();
try {
rows = parseInputRows(row).iterator();
++currentRecordNumber;
} catch (IOException e) {
final Map<String, Object> metadata = intermediateRowIteratorWithMetadata.currentMetadata();
rows = new ExceptionThrowingIterator(new ParseException(String.valueOf(row), e, buildParseExceptionMessage(StringUtils.format("Unable to parse row [%s]", row), source(), currentRecordNumber, metadata)));
} catch (ParseException e) {
final Map<String, Object> metadata = intermediateRowIteratorWithMetadata.currentMetadata();
// Replace the message of the ParseException e
rows = new ExceptionThrowingIterator(new ParseException(e.getInput(), e.isFromPartiallyValidRow(), buildParseExceptionMessage(e.getMessage(), source(), currentRecordNumber, metadata)));
}
}
return true;
}
@Override
public InputRow next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
return rows.next();
}
@Override
public void close() throws IOException {
intermediateRowIteratorWithMetadata.close();
}
};
}
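The hasNext()/next() pair above flattens each intermediate row into potentially several InputRows while translating parse failures into an exception-throwing iterator. Below is a minimal JDK-only sketch of just the flattening shape, with the error handling omitted; FlatteningIterator and the parse function are illustrative stand-ins for this reader and parseInputRows().
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.function.Function;

// Flattens an iterator of intermediate rows into an iterator of parsed rows.
class FlatteningIterator<T, R> implements Iterator<R> {
  private final Iterator<T> intermediateRows;
  private final Function<T, List<R>> parse;
  private Iterator<R> rows = null;

  FlatteningIterator(Iterator<T> intermediateRows, Function<T, List<R>> parse) {
    this.intermediateRows = intermediateRows;
    this.parse = parse;
  }

  @Override
  public boolean hasNext() {
    // Loop (rather than a single if) so intermediate rows that parse to an empty list are skipped.
    while (rows == null || !rows.hasNext()) {
      if (!intermediateRows.hasNext()) {
        return false;
      }
      rows = parse.apply(intermediateRows.next()).iterator();
    }
    return true;
  }

  @Override
  public R next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    return rows.next();
  }
}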
use of org.apache.druid.java.util.common.parsers.CloseableIterator in project druid by druid-io.
the class IndexTask method collectIntervalsAndShardSpecs.
private Map<Interval, Optional<HyperLogLogCollector>> collectIntervalsAndShardSpecs(ObjectMapper jsonMapper, IndexIngestionSpec ingestionSchema, InputSource inputSource, File tmpDir, GranularitySpec granularitySpec, @Nonnull PartitionsSpec partitionsSpec, boolean determineIntervals) throws IOException {
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = new TreeMap<>(Comparators.intervalsByStartThenEnd());
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
final Predicate<InputRow> rowFilter = inputRow -> {
if (inputRow == null) {
return false;
}
if (determineIntervals) {
return true;
}
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
return optInterval.isPresent();
};
try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(tmpDir, ingestionSchema.getDataSchema(), inputSource, inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null, rowFilter, determinePartitionsMeters, determinePartitionsParseExceptionHandler)) {
while (inputRowIterator.hasNext()) {
final InputRow inputRow = inputRowIterator.next();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
// this interval must exist since it passed the rowFilter
assert optInterval.isPresent();
interval = optInterval.get();
}
if (partitionsSpec.needsDeterminePartitions(false)) {
hllCollectors.computeIfAbsent(interval, intv -> Optional.of(HyperLogLogCollector.makeLatestCollector()));
List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestampFromEpoch()), inputRow);
hllCollectors.get(interval).get().add(HASH_FUNCTION.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
} else {
// We don't need to determine partitions, but we still need to determine intervals, so add an Optional.absent()
// for the interval and don't instantiate an HLL collector.
hllCollectors.putIfAbsent(interval, Optional.absent());
}
determinePartitionsMeters.incrementProcessed();
}
}
// These metrics are reported in generateAndPublishSegments()
if (determinePartitionsMeters.getThrownAway() > 0) {
log.warn("Unable to find a matching interval for [%,d] events", determinePartitionsMeters.getThrownAway());
}
if (determinePartitionsMeters.getUnparseable() > 0) {
log.warn("Unable to parse [%,d] events", determinePartitionsMeters.getUnparseable());
}
return hllCollectors;
}
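A simplified sketch of the bucket-and-collect loop above, assuming fixed-width time buckets: java.util.Optional and a HashSet of group-key hashes stand in for Guava's Optional and the HyperLogLogCollector, and all names here are illustrative rather than Druid APIs.
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;

class IntervalCollectorSketch {
  // Each row is [timestampMillis, groupKeyHash]; returns bucket start -> optional distinct-key set.
  static Map<Long, Optional<Set<Long>>> collect(Iterable<long[]> rows, long bucketMillis, boolean needsDeterminePartitions) {
    final Map<Long, Optional<Set<Long>>> buckets = new TreeMap<>();
    for (long[] row : rows) {
      final long bucketStart = Math.floorDiv(row[0], bucketMillis) * bucketMillis;
      if (needsDeterminePartitions) {
        // Track the distinct group keys per bucket (approximated by an HLL collector in the real code).
        buckets.computeIfAbsent(bucketStart, b -> Optional.of(new HashSet<>())).get().add(row[1]);
      } else {
        // Intervals only: record the bucket without building a cardinality estimate.
        buckets.putIfAbsent(bucketStart, Optional.empty());
      }
    }
    return buckets;
  }
}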
use of org.apache.druid.java.util.common.parsers.CloseableIterator in project druid by druid-io.
the class OrcReader method intermediateRowIterator.
@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
final Closer closer = Closer.create();
// We fetch here to cache a copy locally. However, this might need to change if we want to split an ORC file
// into several InputSplits in the future.
final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
final Path path = new Path(file.file().toURI());
final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
final Reader reader;
try {
Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
} finally {
Thread.currentThread().setContextClassLoader(currentClassLoader);
}
// The line below gets the schema for reading all columns.
// This could be improved in the future by projecting only the columns users actually want.
final TypeDescription schema = reader.getSchema();
final RecordReader batchReader = reader.rows(reader.options());
final OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(batchReader, schema);
closer.register(recordReader::close);
return new CloseableIterator<OrcStruct>() {
final NullWritable key = recordReader.createKey();
OrcStruct value = null;
@Override
public boolean hasNext() {
if (value == null) {
try {
// The OrcStruct returned from next() can be kept in memory for a while.
// Here, we create a new OrcStruct instance before calling RecordReader.next(),
// so that we avoid sharing the same "value" reference across rows.
value = recordReader.createValue();
if (!recordReader.next(key, value)) {
value = null;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return value != null;
}
@Override
public OrcStruct next() {
if (value == null) {
throw new NoSuchElementException();
}
final OrcStruct currentValue = value;
value = null;
return currentValue;
}
@Override
public void close() throws IOException {
closer.close();
}
};
}
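The hasNext() above reads one record ahead and caches it, which is the usual way to adapt a pull-style RecordReader to the Iterator contract. The same read-ahead pattern over a plain BufferedReader, as a minimal sketch (LineIterator is an illustrative name, not a Druid or ORC class):
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

// Read-ahead iterator: hasNext() pulls and caches the next record, next() hands it out.
class LineIterator implements Iterator<String>, Closeable {
  private final BufferedReader reader;
  private String value;

  LineIterator(Reader in) {
    this.reader = new BufferedReader(in);
  }

  @Override
  public boolean hasNext() {
    if (value == null) {
      try {
        value = reader.readLine(); // null at end of stream
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
    return value != null;
  }

  @Override
  public String next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    final String current = value;
    value = null;
    return current;
  }

  @Override
  public void close() throws IOException {
    reader.close();
  }
}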
use of org.apache.druid.java.util.common.parsers.CloseableIterator in project druid by druid-io.
the class DruidSegmentReader method intermediateRowIterator.
@Override
protected CloseableIterator<Map<String, Object>> intermediateRowIterator() throws IOException {
final CleanableFile segmentFile = source.fetch(temporaryDirectory, null);
final WindowedStorageAdapter storageAdapter = new WindowedStorageAdapter(new QueryableIndexStorageAdapter(indexIO.loadIndex(segmentFile.file())), source.getIntervalFilter());
final Sequence<Cursor> cursors = storageAdapter.getAdapter().makeCursors(Filters.toFilter(dimFilter), storageAdapter.getInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null);
// Retain order of columns from the original segments. Useful for preserving dimension order if we're in
// schemaless mode.
final Set<String> columnsToRead = Sets.newLinkedHashSet(Iterables.filter(storageAdapter.getAdapter().getRowSignature().getColumnNames(), columnsFilter::apply));
final Sequence<Map<String, Object>> sequence = Sequences.concat(Sequences.map(cursors, cursor -> cursorToSequence(cursor, columnsToRead)));
return makeCloseableIteratorFromSequenceAndSegmentFile(sequence, segmentFile);
}
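The order-preserving column filter above uses Guava's Sets.newLinkedHashSet and Iterables.filter. A JDK-only sketch of the same idea, collecting into a LinkedHashSet so the segment's original column order survives the filtering (ColumnProjectionSketch is illustrative):
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;

class ColumnProjectionSketch {
  // Keeps only the columns accepted by the filter, preserving their original order.
  static Set<String> columnsToRead(List<String> signatureColumns, Predicate<String> columnsFilter) {
    return signatureColumns.stream()
        .filter(columnsFilter)
        .collect(Collectors.toCollection(LinkedHashSet::new));
  }
}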