Use of org.apache.drill.common.exceptions.ExecutionSetupException in project drill by axbaretto.
The class SequenceFileRecordReader, method setup.
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
  // Configure the Hadoop input format that exposes keys and values as raw bytes.
  final SequenceFileAsBinaryInputFormat inputFormat = new SequenceFileAsBinaryInputFormat();
  final JobConf jobConf = new JobConf(dfs.getConf());
  jobConf.setInputFormat(inputFormat.getClass());
  reader = getRecordReader(inputFormat, jobConf);
  // Register one nullable VARBINARY vector each for the key and the value column.
  final MaterializedField keyField = MaterializedField.create(keySchema, KEY_TYPE);
  final MaterializedField valueField = MaterializedField.create(valueSchema, VALUE_TYPE);
  try {
    keyVector = output.addField(keyField, NullableVarBinaryVector.class);
    valueVector = output.addField(valueField, NullableVarBinaryVector.class);
  } catch (SchemaChangeException sce) {
    throw new ExecutionSetupException("Error in setting up sequencefile reader.", sce);
  }
}
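Here, getRecordReader(inputFormat, jobConf) is a helper defined on the reader class (not shown above); underneath it, SequenceFileAsBinaryInputFormat hands each key and value back as raw BytesWritable bytes. The following is a minimal sketch of driving that input format with plain Hadoop, assuming a hypothetical file path and split bounds rather than anything taken from the Drill scan plan:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat;

public class SequenceFileBinarySketch {
  public static void main(String[] args) throws Exception {
    JobConf jobConf = new JobConf();
    SequenceFileAsBinaryInputFormat inputFormat = new SequenceFileAsBinaryInputFormat();
    jobConf.setInputFormat(inputFormat.getClass());

    // Assumed path and split bounds; a real reader derives these from the scan plan.
    Path path = new Path("/tmp/example.seq");
    FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[0]);

    RecordReader<BytesWritable, BytesWritable> reader =
        inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
    BytesWritable key = reader.createKey();
    BytesWritable value = reader.createValue();
    while (reader.next(key, value)) {
      // Each pair arrives as raw bytes; a Drill reader would copy these into value vectors.
      System.out.println(key.getLength() + " key bytes, " + value.getLength() + " value bytes");
    }
    reader.close();
  }
}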
Use of org.apache.drill.common.exceptions.ExecutionSetupException in project drill by axbaretto.
The class CompliantTextRecordReader, method setup.
/**
 * Performs the initial setup required for the record reader.
 * Initializes the input stream, the handling of the output record batch,
 * and the actual reader to be used.
 * @param context operator context from which buffers will be allocated and managed
 * @param outputMutator used to create the schema in the output record batch
 * @throws ExecutionSetupException
 */
@SuppressWarnings("resource")
@Override
public void setup(OperatorContext context, OutputMutator outputMutator) throws ExecutionSetupException {
  oContext = context;
  // Note: DO NOT use managed buffers here. They remain in existence
  // until the fragment is shut down. The buffers here are large.
  // If we scan 1000 files, and allocate 1 MB for each, we end up
  // holding onto 1 GB of memory in managed buffers.
  // Instead, we allocate the buffers explicitly, and must free them.
  // readBuffer = context.getManagedBuffer(READ_BUFFER);
  // whitespaceBuffer = context.getManagedBuffer(WHITE_SPACE_BUFFER);
  readBuffer = context.getAllocator().buffer(READ_BUFFER);
  whitespaceBuffer = context.getAllocator().buffer(WHITE_SPACE_BUFFER);

  // set up the output, input, and reader
  try {
    TextOutput output = null;
    TextInput input = null;
    InputStream stream = null;

    // set up the output using the OutputMutator
    if (settings.isHeaderExtractionEnabled()) {
      // extract the header and use it to set up a set of VarCharVectors
      String[] fieldNames = extractHeader();
      output = new FieldVarCharOutput(outputMutator, fieldNames, getColumns(), isStarQuery());
    } else {
      // simply use a RepeatedVarCharVector
      output = new RepeatedVarCharOutput(outputMutator, getColumns(), isStarQuery());
    }

    // set up the input using the InputStream
    logger.trace("Opening file {}", split.getPath());
    stream = dfs.openPossiblyCompressedStream(split.getPath());
    input = new TextInput(settings, stream, readBuffer, split.getStart(), split.getStart() + split.getLength());

    // set up the reader using the input and output
    reader = new TextReader(settings, input, output, whitespaceBuffer);
    reader.start();
  } catch (SchemaChangeException | IOException e) {
    throw new ExecutionSetupException(String.format("Failure while setting up text reader for file %s", split.getPath()), e);
  } catch (IllegalArgumentException e) {
    throw UserException.dataReadError(e).addContext("File Path", split.getPath().toString()).build(logger);
  }
}
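Because readBuffer and whitespaceBuffer are allocated directly from the operator allocator instead of as managed buffers, the reader itself must release them when it is done. The cleanup below is only an illustrative sketch of that obligation, not the actual Drill close() implementation; the method structure, null checks, and log message are assumptions:

// Illustrative cleanup for explicitly allocated buffers; not the actual Drill method body.
@Override
public void close() {
  try {
    if (reader != null) {
      reader.close();       // closes the underlying TextInput / InputStream
      reader = null;
    }
  } catch (IOException e) {
    logger.warn("Exception while closing text reader", e);
  } finally {
    if (readBuffer != null) {
      readBuffer.release();       // return the read buffer to the allocator
      readBuffer = null;
    }
    if (whitespaceBuffer != null) {
      whitespaceBuffer.release(); // return the whitespace buffer to the allocator
      whitespaceBuffer = null;
    }
  }
}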
Use of org.apache.drill.common.exceptions.ExecutionSetupException in project drill by axbaretto.
The class JSONRecordReader, method setup.
@Override
public void setup(final OperatorContext context, final OutputMutator output) throws ExecutionSetupException {
  try {
    if (hadoopPath != null) {
      this.stream = fileSystem.openPossiblyCompressedStream(hadoopPath);
    }
    this.writer = new VectorContainerWriter(output, unionEnabled);
    if (isSkipQuery()) {
      // Skip queries (e.g. COUNT(*)) only need record counts, not parsed values.
      this.jsonReader = new CountingJsonReader(fragmentContext.getManagedBuffer(), enableNanInf);
    } else {
      this.jsonReader = new JsonReader.Builder(fragmentContext.getManagedBuffer())
          .schemaPathColumns(ImmutableList.copyOf(getColumns()))
          .allTextMode(enableAllTextMode)
          .skipOuterList(true)
          .readNumbersAsDouble(readNumbersAsDouble)
          .enableNanInf(enableNanInf)
          .build();
    }
    setupParser();
  } catch (final Exception e) {
    handleAndRaise("Failure reading JSON file", e);
  }
}
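setupParser() is not shown above; Drill's JSON reader is built on Jackson's streaming parser. As a rough, self-contained sketch of the kind of parser configuration the enableNanInf flag implies (plain Jackson here, not the Drill-internal call):

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import java.io.IOException;
import java.io.InputStream;

public class JsonParserSketch {
  // Build a Jackson parser that tolerates NaN/Infinity literals, mirroring enableNanInf.
  public static JsonParser openParser(InputStream stream, boolean enableNanInf) throws IOException {
    JsonFactory factory = new JsonFactory();
    factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, enableNanInf);
    return factory.createParser(stream);
  }
}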
Use of org.apache.drill.common.exceptions.ExecutionSetupException in project drill by axbaretto.
The class ParquetScanBatchCreator, method getBatch.
@SuppressWarnings("resource")
@Override
public ScanBatch getBatch(ExecutorFragmentContext context, ParquetRowGroupScan rowGroupScan, List<RecordBatch> children) throws ExecutionSetupException {
  Preconditions.checkArgument(children.isEmpty());
  OperatorContext oContext = context.newOperatorContext(rowGroupScan);
  final ColumnExplorer columnExplorer = new ColumnExplorer(context.getOptions(), rowGroupScan.getColumns());
  if (!columnExplorer.isStarQuery()) {
    rowGroupScan = new ParquetRowGroupScan(rowGroupScan.getUserName(), rowGroupScan.getStorageEngine(),
        rowGroupScan.getRowGroupReadEntries(), columnExplorer.getTableColumns(),
        rowGroupScan.getSelectionRoot(), rowGroupScan.getFilter());
    rowGroupScan.setOperatorId(rowGroupScan.getOperatorId());
  }
  DrillFileSystem fs;
  try {
    boolean useAsyncPageReader = context.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
    if (useAsyncPageReader) {
      fs = oContext.newNonTrackingFileSystem(rowGroupScan.getStorageEngine().getFsConf());
    } else {
      fs = oContext.newFileSystem(rowGroupScan.getStorageEngine().getFsConf());
    }
  } catch (IOException e) {
    throw new ExecutionSetupException(String.format("Failed to create DrillFileSystem: %s", e.getMessage()), e);
  }
  Configuration conf = new Configuration(fs.getConf());
  conf.setBoolean(ENABLE_BYTES_READ_COUNTER, false);
  conf.setBoolean(ENABLE_BYTES_TOTAL_COUNTER, false);
  conf.setBoolean(ENABLE_TIME_READ_COUNTER, false);
  // keep footers in a map to avoid re-reading them
  Map<String, ParquetMetadata> footers = Maps.newHashMap();
  List<RecordReader> readers = new LinkedList<>();
  List<Map<String, String>> implicitColumns = Lists.newArrayList();
  Map<String, String> mapWithMaxColumns = Maps.newLinkedHashMap();
  for (RowGroupReadEntry e : rowGroupScan.getRowGroupReadEntries()) {
    /*
      Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file.
      TODO - to prevent reading the footer again in the Parquet record reader (it is read earlier in the ParquetStorageEngine),
      we should add more information to the RowGroupInfo that will be populated upon the first read to
      provide the reader with all of the file metadata it needs.
      These fields will be added to the constructor below.
    */
    try {
      Stopwatch timer = Stopwatch.createUnstarted();
      if (!footers.containsKey(e.getPath())) {
        timer.start();
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(e.getPath()));
        long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
        logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", e.getPath(), "", 0, 0, 0, timeToRead);
        footers.put(e.getPath(), footer);
      }
      boolean autoCorrectCorruptDates = rowGroupScan.getFormatConfig().areCorruptDatesAutoCorrected();
      ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
          ParquetReaderUtility.detectCorruptDates(footers.get(e.getPath()), rowGroupScan.getColumns(), autoCorrectCorruptDates);
      if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
      }
      if (!context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER) && !isComplex(footers.get(e.getPath()))) {
        readers.add(new ParquetRecordReader(context, e.getPath(), e.getRowGroupIndex(), e.getNumRecordsToRead(), fs,
            CodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0),
            footers.get(e.getPath()), rowGroupScan.getColumns(), containsCorruptDates));
      } else {
        ParquetMetadata footer = footers.get(e.getPath());
        readers.add(new DrillParquetReader(context, footer, e, columnExplorer.getTableColumns(), fs, containsCorruptDates));
      }
      Map<String, String> implicitValues = columnExplorer.populateImplicitColumns(e, rowGroupScan.getSelectionRoot());
      implicitColumns.add(implicitValues);
      if (implicitValues.size() > mapWithMaxColumns.size()) {
        mapWithMaxColumns = implicitValues;
      }
    } catch (IOException e1) {
      throw new ExecutionSetupException(e1);
    }
  }
  // all readers should have the same number of implicit columns; add missing ones with value null
  Map<String, String> diff = Maps.transformValues(mapWithMaxColumns, Functions.constant((String) null));
  for (Map<String, String> map : implicitColumns) {
    map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
  }
  return new ScanBatch(context, oContext, readers, implicitColumns);
}
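The final loop pads every reader's implicit-column map so that all readers expose the same key set, inserting null for the columns a reader lacks. A self-contained sketch of that Guava idiom follows; the column names and values are invented purely for illustration:

import com.google.common.base.Functions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.List;
import java.util.Map;

public class ImplicitColumnPaddingSketch {
  public static void main(String[] args) {
    Map<String, String> full = Maps.newLinkedHashMap();
    full.put("filename", "a.parquet");
    full.put("dir0", "2017");

    Map<String, String> partial = Maps.newLinkedHashMap();
    partial.put("filename", "b.parquet"); // no dir0 entry for this reader

    List<Map<String, String>> implicitColumns = Lists.newArrayList(full, partial);

    // Template with the full key set, every value nulled out.
    Map<String, String> diff = Maps.transformValues(full, Functions.constant((String) null));
    for (Map<String, String> map : implicitColumns) {
      // Copy over only the keys the map is missing, each with a null value.
      map.putAll(Maps.difference(map, diff).entriesOnlyOnRight());
    }
    System.out.println(implicitColumns); // [{filename=a.parquet, dir0=2017}, {filename=b.parquet, dir0=null}]
  }
}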
Use of org.apache.drill.common.exceptions.ExecutionSetupException in project drill by axbaretto.
The class ExtendedMockRecordReader, method setup.
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
  try {
    final int estimateRowSize = getEstimatedRecordSize();
    valueVectors = new ValueVector[fields.length];
    // Default to a 10 MB batch when no batch size is configured.
    int batchSize = config.getBatchSize();
    if (batchSize == 0) {
      batchSize = 10 * 1024 * 1024;
    }
    // Clamp the per-batch record count to [1, Character.MAX_VALUE].
    batchRecordCount = Math.max(1, batchSize / estimateRowSize);
    batchRecordCount = Math.min(batchRecordCount, Character.MAX_VALUE);
    // Create one value vector per mock column definition.
    for (int i = 0; i < fields.length; i++) {
      final ColumnDef col = fields[i];
      final MajorType type = col.getConfig().getMajorType();
      final MaterializedField field = MaterializedField.create(col.getName(), type);
      final Class<? extends ValueVector> vvClass =
          TypeHelper.getValueVectorClass(field.getType().getMinorType(), field.getDataMode());
      valueVectors[i] = output.addField(field, vvClass);
    }
  } catch (SchemaChangeException e) {
    throw new ExecutionSetupException("Failure while setting up fields", e);
  }
}
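The two Math calls bound the per-batch record count to at least one record and at most Character.MAX_VALUE (65535), presumably to stay within Drill's 64K records-per-batch ceiling. A small self-contained sketch of that sizing rule, with row-size figures invented for illustration:

public class BatchSizeSketch {
  // Mirrors the sizing rule above: batchSize / rowSize, clamped to [1, Character.MAX_VALUE].
  static int recordsPerBatch(int batchSizeBytes, int estimatedRowSizeBytes) {
    if (batchSizeBytes == 0) {
      batchSizeBytes = 10 * 1024 * 1024; // default 10 MB, as in the reader above
    }
    int count = Math.max(1, batchSizeBytes / estimatedRowSizeBytes);
    return Math.min(count, Character.MAX_VALUE);
  }

  public static void main(String[] args) {
    // Invented row sizes purely for illustration.
    System.out.println(recordsPerBatch(0, 200));     // 52428 -> within the 65535 cap
    System.out.println(recordsPerBatch(0, 8));       // 1310720 -> clamped to 65535
    System.out.println(recordsPerBatch(1024, 4096)); // 0 -> clamped up to 1
  }
}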