Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.
Class DrillParquetReader, method getProjection.
public static MessageType getProjection(MessageType schema, Collection<SchemaPath> columns, List<SchemaPath> columnsNotFound) {
  MessageType projection = null;
  String messageName = schema.getName();
  List<ColumnDescriptor> schemaColumns = schema.getColumns();
  // parquet type.union() seems to lose ConvertedType info when merging two columns that are the same type. This can
  // happen when selecting two elements from an array. So to work around this, we use a set of SchemaPath to avoid duplicates
  // and then merge the types at the end
  Set<SchemaPath> selectedSchemaPaths = Sets.newLinkedHashSet();
  // get a list of modified columns which have the array elements removed from the schema path, since the parquet schema doesn't include array elements
  List<SchemaPath> modifiedColumns = Lists.newLinkedList();
  for (SchemaPath path : columns) {
    List<String> segments = Lists.newArrayList();
    PathSegment seg = path.getRootSegment();
    do {
      if (seg.isNamed()) {
        segments.add(seg.getNameSegment().getPath());
      }
    } while ((seg = seg.getChild()) != null);
    String[] pathSegments = new String[segments.size()];
    segments.toArray(pathSegments);
    SchemaPath modifiedSchemaPath = SchemaPath.getCompoundPath(pathSegments);
    modifiedColumns.add(modifiedSchemaPath);
  }
  // convert the columns in the parquet schema to a list of SchemaPath columns so that they can be compared
  // in a case-insensitive manner to the projection columns
  List<SchemaPath> schemaPaths = Lists.newLinkedList();
  for (ColumnDescriptor columnDescriptor : schemaColumns) {
    String[] schemaColDesc = Arrays.copyOf(columnDescriptor.getPath(), columnDescriptor.getPath().length);
    SchemaPath schemaPath = SchemaPath.getCompoundPath(schemaColDesc);
    schemaPaths.add(schemaPath);
  }
  // loop through the projection columns and add any columns that are missing from the parquet schema to the columnsNotFound list
  for (SchemaPath columnPath : modifiedColumns) {
    boolean notFound = true;
    for (SchemaPath schemaPath : schemaPaths) {
      if (schemaPath.contains(columnPath)) {
        selectedSchemaPaths.add(schemaPath);
        notFound = false;
      }
    }
    if (notFound) {
      columnsNotFound.add(columnPath);
    }
  }
  // convert the SchemaPaths in selectedSchemaPaths to parquet types and merge them into the projection schema
  for (SchemaPath schemaPath : selectedSchemaPaths) {
    List<String> segments = Lists.newArrayList();
    PathSegment seg = schemaPath.getRootSegment();
    do {
      segments.add(seg.getNameSegment().getPath());
    } while ((seg = seg.getChild()) != null);
    String[] pathSegments = new String[segments.size()];
    segments.toArray(pathSegments);
    Type t = getType(pathSegments, 0, schema);
    if (projection == null) {
      projection = new MessageType(messageName, t);
    } else {
      projection = projection.union(new MessageType(messageName, t));
    }
  }
  return projection;
}
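For reference, a minimal sketch (not part of the Drill source above) of the ColumnDescriptor-to-SchemaPath conversion that getProjection relies on when matching projection columns; the schema string here is an assumed example for illustration only:

MessageType schema = MessageTypeParser.parseMessageType(
    "message m { required group user { required binary name (UTF8); } }");
for (ColumnDescriptor columnDescriptor : schema.getColumns()) {
  // getPath() returns the full path from the message root as a String[], e.g. {"user", "name"}
  SchemaPath schemaPath = SchemaPath.getCompoundPath(columnDescriptor.getPath());
  // schemaPath can now be compared case-insensitively against the projection columns
}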
Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.
Class ReadState, method buildReader.
/**
 * Create the readers needed to read columns: fixed-length or variable-length.
 *
 * @param reader the Parquet record reader that owns these columns
 * @param output the output mutator used to create the value vectors
 * @throws Exception if a column reader or value vector cannot be created
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (!columnMetadata.isFixedLength()) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader, schema.getRecordsPerBatch()));
    } else {
      columnReaders.add(columnMetadata.makeFixedWidthReader(reader, schema.getRecordsPerBatch()));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (!schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
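The chunk lookup above keys each ColumnChunkMetaData position by the string form of its column path, which matches the String[] returned by ColumnDescriptor.getPath(). A hypothetical helper (a sketch of what schema.buildChunkMap is assumed to produce, not Drill's actual implementation) could look like this:

static Map<String, Integer> buildChunkMap(BlockMetaData rowGroupMetadata) {
  Map<String, Integer> chunkMap = new HashMap<>();
  List<ColumnChunkMetaData> chunks = rowGroupMetadata.getColumns();
  for (int i = 0; i < chunks.size(); i++) {
    // ColumnPath.toArray() yields the same String[] that ColumnDescriptor.getPath() returns,
    // so Arrays.toString(...) produces a matching key for both sides of the lookup
    chunkMap.put(Arrays.toString(chunks.get(i).getPath().toArray()), i);
  }
  return chunkMap;
}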
Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.
Class ParquetFooterStatCollector, method collectColStat.
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
  // map from column name to ColumnDescriptor
  Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
  // map from column name to MajorType
  final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
  // map from column name to column statistics
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }
  for (final SchemaElement se : fileMetaData.getSchema()) {
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }
  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }
  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
      columnTypeMap.put(path, type);
      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
      }
      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        byte[] val = implicitColValues.get(columnName).getBytes();
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to collect column statistics for the row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
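As a side note, a minimal sketch (assuming the parquet-hadoop Statistics API already used above, with colMetaData standing in for any matched ColumnChunkMetaData) of how min/max and null counts can be read back from a column chunk's statistics:

static void printChunkStats(ColumnChunkMetaData colMetaData) {
  Statistics stat = colMetaData.getStatistics();
  if (stat != null && !stat.isEmpty()) {
    Object min = stat.genericGetMin();   // typed minimum value for the column chunk
    Object max = stat.genericGetMax();   // typed maximum value for the column chunk
    long numNulls = stat.getNumNulls();  // number of nulls recorded for the chunk
    System.out.println(min + " .. " + max + " (" + numNulls + " nulls)");
  }
}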
Use of org.apache.parquet.column.ColumnDescriptor in project drill by axbaretto.
Class TestFileGenerator, method generateParquetFile.
public static void generateParquetFile(String filename, ParquetTestProperties props) throws Exception {
  int currentBooleanByte = 0;
  WrapAroundCounter booleanBitCounter = new WrapAroundCounter(7);
  Configuration configuration = new Configuration();
  configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
  // "message m { required int32 integer; required int64 integer64; required boolean b; required float f; required double d;}"
  FileSystem fs = FileSystem.get(configuration);
  Path path = new Path(filename);
  if (fs.exists(path)) {
    fs.delete(path, false);
  }
  String messageSchema = "message m {";
  for (FieldInfo fieldInfo : props.fields.values()) {
    messageSchema += " required " + fieldInfo.parquetType + " " + fieldInfo.name + ";";
  }
  // remove the last semicolon, java really needs a join method for strings...
  // TODO - nvm apparently it requires a semicolon after every field decl, might want to file a bug
  // messageSchema = messageSchema.substring(schemaType, messageSchema.length() - 1);
  messageSchema += "}";
  MessageType schema = MessageTypeParser.parseMessageType(messageSchema);
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  HashMap<String, Integer> columnValuesWritten = new HashMap<>();
  int valsWritten;
  for (int k = 0; k < props.numberRowGroups; k++) {
    w.startBlock(props.recordsPerRowGroup);
    currentBooleanByte = 0;
    booleanBitCounter.reset();
    for (FieldInfo fieldInfo : props.fields.values()) {
      if (!columnValuesWritten.containsKey(fieldInfo.name)) {
        columnValuesWritten.put(fieldInfo.name, 0);
        valsWritten = 0;
      } else {
        valsWritten = columnValuesWritten.get(fieldInfo.name);
      }
      String[] path1 = { fieldInfo.name };
      ColumnDescriptor c1 = schema.getColumnDescription(path1);
      w.startColumn(c1, props.recordsPerRowGroup, codec);
      final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
      // 1 MB
      final int PAGE_SIZE = 1024 * 1024;
      byte[] bytes;
      RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
      RunLengthBitPackingHybridValuesWriter repLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
      // for variable length binary fields
      int bytesNeededToEncodeLength = 4;
      if (fieldInfo.bitLength > 0) {
        bytes = new byte[(int) Math.ceil(valsPerPage * fieldInfo.bitLength / 8.0)];
      } else {
        // the twelve at the end is to account for storing a 4 byte length with each value
        int totalValLength = ((byte[]) fieldInfo.values[0]).length + ((byte[]) fieldInfo.values[1]).length + ((byte[]) fieldInfo.values[2]).length + 3 * bytesNeededToEncodeLength;
        // used for the case where the number of values in this row group is not divisible by 3
        int leftOverBytes = 0;
        if (valsPerPage % 3 > 0) {
          leftOverBytes += ((byte[]) fieldInfo.values[1]).length + bytesNeededToEncodeLength;
        }
        if (valsPerPage % 3 > 1) {
          leftOverBytes += ((byte[]) fieldInfo.values[2]).length + bytesNeededToEncodeLength;
        }
        bytes = new byte[valsPerPage / 3 * totalValLength + leftOverBytes];
      }
      int bytesPerPage = (int) (valsPerPage * (fieldInfo.bitLength / 8.0));
      int bytesWritten = 0;
      for (int z = 0; z < fieldInfo.numberOfPages; z++, bytesWritten = 0) {
        for (int i = 0; i < valsPerPage; i++) {
          repLevels.writeInteger(0);
          defLevels.writeInteger(1);
          // System.out.print(i + ", " + (i % 25 == 0 ? "\n gen " + fieldInfo.name + ": " : ""));
          if (fieldInfo.values[0] instanceof Boolean) {
            bytes[currentBooleanByte] |= bitFields[booleanBitCounter.val] & ((boolean) fieldInfo.values[valsWritten % 3] ? allBitsTrue : allBitsFalse);
            booleanBitCounter.increment();
            if (booleanBitCounter.val == 0) {
              currentBooleanByte++;
            }
            valsWritten++;
            if (currentBooleanByte > bytesPerPage) {
              break;
            }
          } else {
            if (fieldInfo.values[valsWritten % 3] instanceof byte[]) {
              System.arraycopy(ByteArrayUtil.toByta(((byte[]) fieldInfo.values[valsWritten % 3]).length), 0, bytes, bytesWritten, bytesNeededToEncodeLength);
              System.arraycopy(fieldInfo.values[valsWritten % 3], 0, bytes, bytesWritten + bytesNeededToEncodeLength, ((byte[]) fieldInfo.values[valsWritten % 3]).length);
              bytesWritten += ((byte[]) fieldInfo.values[valsWritten % 3]).length + bytesNeededToEncodeLength;
            } else {
              System.arraycopy(ByteArrayUtil.toByta(fieldInfo.values[valsWritten % 3]), 0, bytes, i * (fieldInfo.bitLength / 8), fieldInfo.bitLength / 8);
            }
            valsWritten++;
          }
        }
        byte[] fullPage = new byte[2 * 4 * valsPerPage + bytes.length];
        byte[] repLevelBytes = repLevels.getBytes().toByteArray();
        byte[] defLevelBytes = defLevels.getBytes().toByteArray();
        System.arraycopy(bytes, 0, fullPage, 0, bytes.length);
        System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
        System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
        w.writeDataPage((props.recordsPerRowGroup / fieldInfo.numberOfPages), fullPage.length, BytesInput.from(fullPage), RLE, RLE, PLAIN);
        currentBooleanByte = 0;
      }
      w.endColumn();
      columnValuesWritten.remove(fieldInfo.name);
      columnValuesWritten.put(fieldInfo.name, valsWritten);
    }
    w.endBlock();
  }
  w.end(new HashMap<String, String>());
  logger.debug("Finished generating parquet file {}", path.getName());
}
Use of org.apache.parquet.column.ColumnDescriptor in project hive by apache.
Class VectorizedParquetRecordReader, method checkEndOfRowGroup.
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  if (!ColumnProjectionUtils.isReadAllColumns(jobConf)) {
    // queries that project no columns (e.g. select count(*)) need no column readers;
    // only initialize a reader per projected column when colsToInclude is not empty
    if (!colsToInclude.isEmpty()) {
      for (int i = 0; i < types.size(); ++i) {
        columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, 0);
      }
    }
  } else {
    for (int i = 0; i < types.size(); ++i) {
      columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, 0);
    }
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
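For context, a hypothetical sketch of how a PageReadStore exposes the pages of each requested column, assuming the parquet PageReadStore/PageReader API referenced above and reusing reader and requestedSchema from the surrounding class:

PageReadStore pages = reader.readNextRowGroup();
if (pages != null) {
  long rowsInGroup = pages.getRowCount();                // rows in this row group
  for (ColumnDescriptor column : requestedSchema.getColumns()) {
    PageReader pageReader = pages.getPageReader(column); // page stream for one column chunk
    // a column reader (here VectorizedColumnReader) decodes these pages into column vectors
  }
}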