use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class ParquetTableMetadataUtils method getIntermediateFields.
/**
 * Collects the Drill types of the intermediate (parent) path segments of every column in the given
 * {@code rowGroup}, keyed by the {@code SchemaPath} of the corresponding path prefix. A type may be
 * {@code null} when it can not be determined. At the moment this hierarchy matters only because of
 * {@link org.apache.drill.common.types.TypeProtos.MinorType#DICT}: filters applied to {@code DICT} values
 * (get by key) must not be pruned out before the actual filtering takes place.
 *
 * <p>An empty immutable map is returned when the table metadata version is older than 4.1.
 *
 * @param parquetTableMetadata the source of column types
 * @param rowGroup row group whose columns should be discovered
 * @return map of column names with their drill types, in discovery order
 */
public static Map<SchemaPath, TypeProtos.MajorType> getIntermediateFields(MetadataBase.ParquetTableMetadataBase parquetTableMetadata, MetadataBase.RowGroupMetadata rowGroup) {
  // Parent types are looked up only for metadata of version 4.1 or newer.
  if (!new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(4, 1)) {
    return Collections.emptyMap();
  }

  Map<SchemaPath, TypeProtos.MajorType> typesByPath = new LinkedHashMap<>();
  for (MetadataBase.ColumnMetadata column : rowGroup.getColumns()) {
    Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = (Metadata_V4.ParquetTableMetadata_v4) parquetTableMetadata;
    List<TypeProtos.MajorType> complexTypes =
        ParquetReaderUtility.getComplexTypes(tableMetadataV4.getColumnTypeInfo(column.getName()).parentTypes);

    // The n-th complex type is registered under the compound path built with the argument n (1-based).
    int depth = 0;
    for (TypeProtos.MajorType complexType : complexTypes) {
      depth++;
      putType(typesByPath, SchemaPath.getCompoundPath(depth, column.getName()), complexType);
    }
  }
  return typesByPath;
}
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class ConvertCountToDirectScanRule method collectCounts.
/**
 * Collects counts for each aggregation call by using the metadata summary information.
 * Returns an empty map if the count can not be determined for at least one aggregation call.
 *
 * <p>Only the COUNT function is handled:
 * <ol>
 *   <li>The total row count is taken from the metadata summary.</li>
 *   <li>For COUNT(*), COUNT(&lt;non null column&gt;) and COUNT(&lt;implicit column&gt;),
 *       the count = total row count.</li>
 *   <li>For COUNT(nullable column), count = (total row count - column's null count).</li>
 *   <li>The count can not be calculated for partition columns.</li>
 *   <li>For columns that are not present in the summary (non-existent columns), the count = 0.</li>
 * </ol>
 *
 * <p>Column names are lower-cased with {@code Locale.ROOT} so that the lookup does not depend on the
 * JVM default locale.
 *
 * @param settings planner options
 * @param metadataSummary metadata summary containing row counts and column counts
 * @param agg aggregate relational expression
 * @param scan scan relational expression
 * @param project project relational expression, may be {@code null}
 * @return result map where key is count column name, value is count value
 */
private Map<String, Long> collectCounts(PlannerSettings settings, Metadata_V4.MetadataSummary metadataSummary, Aggregate agg, TableScan scan, Project project) {
  final Set<String> implicitColumnsNames = ColumnExplorer.initImplicitFileColumns(settings.getOptions()).keySet();
  final long totalRecordCount = metadataSummary.getTotalRowCount();
  final LinkedHashMap<String, Long> result = new LinkedHashMap<>();

  for (int i = 0; i < agg.getAggCallList().size(); i++) {
    AggregateCall aggCall = agg.getAggCallList().get(i);
    long cnt;

    // rule can be applied only for count function, return empty counts
    if (!"count".equalsIgnoreCase(aggCall.getAggregation().getName())) {
      return ImmutableMap.of();
    }

    if (CountToDirectScanUtils.containsStarOrNotNullInput(aggCall, agg)) {
      cnt = totalRecordCount;
    } else if (aggCall.getArgList().size() == 1) {
      // count(columnName) ==> Agg ( Scan )) ==> columnValueCount
      int index = aggCall.getArgList().get(0);

      if (project != null) {
        // return count of "col2" in Scan's metadata, if found.
        if (!(project.getProjects().get(index) instanceof RexInputRef)) {
          // do not apply for all other cases.
          return ImmutableMap.of();
        }
        index = ((RexInputRef) project.getProjects().get(index)).getIndex();
      }

      // Locale.ROOT keeps the lower-casing independent of the default locale
      // (e.g. a Turkish locale would map 'I' to a dotless 'ı').
      String columnName = scan.getRowType().getFieldNames().get(index).toLowerCase(java.util.Locale.ROOT);

      // for implicit column count will be the same as total record count
      if (implicitColumnsNames.contains(columnName)) {
        cnt = totalRecordCount;
      } else {
        SchemaPath simplePath = SchemaPath.getSimplePath(columnName);

        if (ColumnExplorer.isPartitionColumn(settings.getOptions(), simplePath)) {
          return ImmutableMap.of();
        }

        Metadata_V4.ColumnTypeMetadata_v4 columnMetadata = metadataSummary.getColumnTypeInfo(new Metadata_V4.ColumnTypeMetadata_v4.Key(simplePath));

        if (columnMetadata == null) {
          // If the column doesn't exist in the table, row count is set to 0
          cnt = 0;
        } else if (columnMetadata.totalNullCount == Statistic.NO_COLUMN_STATS) {
          // if column stats is not available don't apply this rule, return empty counts
          return ImmutableMap.of();
        } else {
          // count of a nullable column = (total row count - column's null count)
          cnt = totalRecordCount - columnMetadata.totalNullCount;
        }
      }
    } else {
      // COUNT with zero or several arguments is not supported by this rule
      return ImmutableMap.of();
    }

    // the call index keeps the key unique even when several calls share the same name
    String name = "count" + i + "$" + (aggCall.getName() == null ? aggCall.toString() : aggCall.getName());
    result.put(name, cnt);
  }

  return ImmutableMap.copyOf(result);
}
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method readBlockMeta.
/**
 * Reads the parquet metadata from a cache file and stores it in this instance.
 *
 * <p>For a directories file the result is kept in {@code parquetTableMetadataDirs}; for any other file it is
 * kept in {@code parquetTableMetadata}. Unless {@code metaContext} reports that the parent directory has
 * already been checked, directories metadata, file metadata and old-style table metadata are tested with
 * {@code tableModified} and re-created when the table was modified; a summary file is read as is.
 * Whenever metadata had to be re-created, {@code metaContext} is cleared.
 *
 * <p>An {@link IOException} is not propagated: it is logged and the metadata cache is flagged as corrupted
 * in {@code metaContext}.
 *
 * @param path to metadata file
 * @param dirsOnly true for {@link Metadata#METADATA_DIRECTORIES_FILENAME}
 *                 or false for {@link Metadata#OLD_METADATA_FILENAME} files reading
 * @param metaContext current metadata context
 * @param fs file system used to open the metadata file and to re-create metadata files
 */
private void readBlockMeta(Path path, boolean dirsOnly, MetadataContext metaContext, FileSystem fs) {
  Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
  Path metadataParentDir = Path.getPathWithoutSchemeAndAuthority(path.getParent());
  String metadataParentDirPath = metadataParentDir.toUri().getPath();

  ObjectMapper mapper = new ObjectMapper();
  final SimpleModule serialModule = new SimpleModule();
  serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
  serialModule.addKeyDeserializer(Metadata_V2.ColumnTypeMetadata_v2.Key.class, new Metadata_V2.ColumnTypeMetadata_v2.Key.DeSerializer());
  serialModule.addKeyDeserializer(Metadata_V3.ColumnTypeMetadata_v3.Key.class, new Metadata_V3.ColumnTypeMetadata_v3.Key.DeSerializer());
  serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());

  AfterburnerModule module = new AfterburnerModule();
  module.setUseOptimizedBeanDeserializer(true);

  boolean isFileMetadata = path.toString().endsWith(METADATA_FILENAME);
  boolean isSummaryFile = path.toString().endsWith(METADATA_SUMMARY_FILENAME);

  mapper.registerModule(serialModule);
  mapper.registerModule(module);
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

  try (InputStream is = fs.open(path)) {
    boolean newMetadata = false;
    boolean alreadyCheckedModification = metaContext.getStatus(metadataParentDirPath);

    if (dirsOnly) {
      parquetTableMetadataDirs = mapper.readValue(is, ParquetTableMetadataDirs.class);
      if (timer != null) {
        logger.debug("Took {} ms to read directories from directory cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }
      parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
      if (!alreadyCheckedModification && tableModified(parquetTableMetadataDirs.getDirectories(), path, metadataParentDir, metaContext, fs)) {
        parquetTableMetadataDirs = (createMetaFilesRecursivelyAsProcessUser(metadataParentDir, fs, true, null, true)).getRight();
        newMetadata = true;
      }
    } else {
      if (isFileMetadata) {
        // the file metadata is attached to the table metadata instance that is already held by this object
        parquetTableMetadata.assignFiles((mapper.readValue(is, FileMetadata.class)).getFiles());
        if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(4, 0)) {
          ((ParquetTableMetadata_v4) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
        }
        if (!alreadyCheckedModification && tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, metaContext, fs)) {
          parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(metadataParentDir, fs, true, null, true)).getLeft();
          newMetadata = true;
        }
      } else if (isSummaryFile) {
        // the summary is taken as is: no relative path update and no modification check
        MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
        parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
      } else {
        parquetTableMetadata = mapper.readValue(is, ParquetTableMetadataBase.class);
        if (new MetadataVersion(parquetTableMetadata.getMetadataVersion()).isAtLeast(3, 0)) {
          ((Metadata_V3.ParquetTableMetadata_v3) parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
        }
        if (!alreadyCheckedModification && tableModified((parquetTableMetadata.getDirectories()), path, metadataParentDir, metaContext, fs)) {
          parquetTableMetadata = (createMetaFilesRecursivelyAsProcessUser(metadataParentDir, fs, true, null, true)).getLeft();
          newMetadata = true;
        }
      }

      if (timer != null) {
        logger.debug("Took {} ms to read metadata from cache file", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
      }

      if (!isSummaryFile) {
        List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
        if (files != null) {
          for (ParquetFileMetadata file : files) {
            // DRILL-5009: Remove empty row groups unless it is the only row group
            List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
            if (rowGroups.size() == 1) {
              continue;
            }
            rowGroups.removeIf(r -> r.getRowCount() == 0);
          }
        }
      }
    }

    // Applies to both branches: re-creating either the directories metadata or the table metadata
    // invalidates the existing metadata context.
    if (newMetadata) {
      metaContext.clear();
    }
  } catch (IOException e) {
    logger.error("Failed to read '{}' metadata file", path, e);
    metaContext.setMetadataCacheCorrupted(true);
  }
}
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method createMetaFilesRecursively.
/**
* Create the parquet metadata files for the directory at the given path and for any subdirectories.
* Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
* <p>
* Subdirectories are processed first (recursively); their files, directories, column type info and row counts
* are accumulated together with the metadata of the files located directly in {@code path}. Three files are
* then written into {@code path}: {@code METADATA_FILENAME}, {@code METADATA_SUMMARY_FILENAME} and
* {@code METADATA_DIRECTORIES_FILENAME}. Files listed in {@code OLD_METADATA_FILENAMES} are deleted from
* {@code path} beforehand.
*
* @param path to the directory of the parquet table
* @param fs file system
* @param allColumnsInteresting if set, store column metadata for all the columns
* @param columnSet Set of columns for which column metadata has to be stored
* @return Pair of parquet metadata. The left one is a parquet metadata for the table. The right one of the Pair is
* a metadata for all subdirectories (if they are present and there are no any parquet files in the
* {@code path} directory).
* @throws IOException if parquet metadata can't be serialized and written to the json file
*/
private Pair<ParquetTableMetadata_v4, ParquetTableMetadataDirs> createMetaFilesRecursively(Path path, FileSystem fs, boolean allColumnsInteresting, Set<SchemaPath> columnSet) throws IOException {
Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
List<ParquetFileMetadata_v4> metaDataList = Lists.newArrayList();
List<Path> directoryList = Lists.newArrayList();
// Column type info accumulated over all subdirectories and over the files of this directory
ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> columnTypeInfoSet = new ConcurrentHashMap<>();
FileStatus fileStatus = fs.getFileStatus(path);
// Running total of rows found in the subdirectories and in the files of this directory
long dirTotalRowCount = 0;
assert fileStatus.isDirectory() : "Expected directory";
// Non-directory entries of this directory; their metadata is collected after the loop
final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();
for (final FileStatus file : DrillFileSystemUtil.listAll(fs, path, false)) {
if (file.isDirectory()) {
// Recurse into the subdirectory; only the table metadata (left side of the pair) is used here
ParquetTableMetadata_v4 subTableMetadata = (createMetaFilesRecursively(file.getPath(), fs, allColumnsInteresting, columnSet)).getLeft();
ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> subTableColumnTypeInfo = subTableMetadata.getColumnTypeInfoMap();
metaDataList.addAll((List<ParquetFileMetadata_v4>) subTableMetadata.getFiles());
directoryList.addAll(subTableMetadata.getDirectories());
directoryList.add(file.getPath());
// TODO: We need a merge method that merges two columns with the same name but different types
if (columnTypeInfoSet.isEmpty()) {
// Nothing accumulated yet: take the subdirectory's column type info as is
columnTypeInfoSet.putAll(subTableColumnTypeInfo);
} else {
for (ColumnTypeMetadata_v4.Key key : subTableColumnTypeInfo.keySet()) {
ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(key);
if (columnTypeMetadata_v4 == null) {
// Column seen for the first time: adopt the subdirectory's entry unchanged
columnTypeMetadata_v4 = subTableColumnTypeInfo.get(key);
} else {
// If the accumulated null count or the subdirectory's null count is negative, mark the total null count
// as unknown
// NOTE(review): the accumulated instance is updated in place; since putAll copies references, it appears
// to be shared with the sub-table metadata it was taken from - confirm this is intended
if (subTableColumnTypeInfo.get(key).totalNullCount < 0 || columnTypeMetadata_v4.totalNullCount < 0) {
columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
} else {
columnTypeMetadata_v4.totalNullCount = columnTypeMetadata_v4.totalNullCount + subTableColumnTypeInfo.get(key).totalNullCount;
}
}
columnTypeInfoSet.put(key, columnTypeMetadata_v4);
}
}
dirTotalRowCount = dirTotalRowCount + subTableMetadata.getTotalRowCount();
} else {
childFiles.put(file, fs);
}
}
Metadata_V4.MetadataSummary metadataSummary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion(), allColumnsInteresting || columnSet == null);
ParquetTableMetadata_v4 parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
if (childFiles.size() > 0) {
List<ParquetFileAndRowCountMetadata> childFileAndRowCountMetadata = getParquetFileMetadata_v4(parquetTableMetadata, childFiles, allColumnsInteresting, columnSet);
// If the columnTypeInfoSet is empty, add the columnTypeInfo from the parquetTableMetadata
if (columnTypeInfoSet.isEmpty()) {
columnTypeInfoSet.putAll(parquetTableMetadata.getColumnTypeInfoMap());
}
for (ParquetFileAndRowCountMetadata parquetFileAndRowCountMetadata : childFileAndRowCountMetadata) {
metaDataList.add(parquetFileAndRowCountMetadata.getFileMetadata());
dirTotalRowCount = dirTotalRowCount + parquetFileAndRowCountMetadata.getFileRowCount();
Map<ColumnTypeMetadata_v4.Key, Long> totalNullCountMap = parquetFileAndRowCountMetadata.getTotalNullCountMap();
for (ColumnTypeMetadata_v4.Key column : totalNullCountMap.keySet()) {
ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(column);
// If the column is not present in columnTypeInfoSet, get it from parquetTableMetadata
if (columnTypeMetadata_v4 == null) {
columnTypeMetadata_v4 = parquetTableMetadata.getColumnTypeInfoMap().get(column);
}
// If the accumulated null count or the null count of the child file is negative, mark the total null count
// as unknown
// NOTE(review): assumes the column is present in at least one of the two maps; otherwise the next line
// would throw a NullPointerException - confirm against getParquetFileMetadata_v4
if (columnTypeMetadata_v4.totalNullCount < 0 || totalNullCountMap.get(column) < 0) {
columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
} else {
columnTypeMetadata_v4.totalNullCount += totalNullCountMap.get(column);
}
columnTypeInfoSet.put(column, columnTypeMetadata_v4);
}
}
}
// Populate the summary with everything accumulated above
metadataSummary.directories = directoryList;
parquetTableMetadata.assignFiles(metaDataList);
// TODO: We need a merge method that merges two columns with the same name but different types
if (metadataSummary.columnTypeInfo == null) {
metadataSummary.columnTypeInfo = new ConcurrentHashMap<>();
}
metadataSummary.columnTypeInfo.putAll(columnTypeInfoSet);
// Overwrites the value passed to the MetadataSummary constructor (allColumnsInteresting || columnSet == null)
metadataSummary.allColumnsInteresting = allColumnsInteresting;
metadataSummary.totalRowCount = dirTotalRowCount;
parquetTableMetadata.metadataSummary = metadataSummary;
// Remove metadata files with old names from this directory before writing the new ones
for (String oldName : OLD_METADATA_FILENAMES) {
fs.delete(new Path(path, oldName), false);
}
// relative paths in the metadata are only necessary for meta cache files.
ParquetTableMetadata_v4 metadataTableWithRelativePaths = MetadataPathUtils.createMetadataWithRelativePaths(parquetTableMetadata, path);
writeFile(metadataTableWithRelativePaths.fileMetadata, new Path(path, METADATA_FILENAME), fs);
writeFile(metadataTableWithRelativePaths.getSummary(), new Path(path, METADATA_SUMMARY_FILENAME), fs);
Metadata_V4.MetadataSummary metadataSummaryWithRelativePaths = metadataTableWithRelativePaths.getSummary();
// Directories list will be empty at the leaf level directories. For sub-directories with both files and directories,
// only the directories will be included in the list.
writeFile(new ParquetTableMetadataDirs(metadataSummaryWithRelativePaths.directories), new Path(path, METADATA_DIRECTORIES_FILENAME), fs);
if (timer != null) {
logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
timer.stop();
}
// The returned metadata is the one built above (not the relative-path copy that was written to disk)
return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(directoryList));
}
use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4 in project drill by apache.
the class Metadata method writeFile.
/**
 * Serialize parquet metadata to json and write to a file.
 *
 * <p>The output stream is always closed, including when serialization fails. A custom serializer for
 * {@code ColumnMetadata_v4} is registered only when {@code parquetMetadata} is a
 * {@code Metadata_V4.FileMetadata}.
 *
 * @param parquetMetadata parquet table or directory metadata
 * @param p file path
 * @param fs Drill file system
 * @throws IOException if metadata can't be serialized or the file can't be written
 */
private void writeFile(Object parquetMetadata, Path p, FileSystem fs) throws IOException {
  JsonFactory jsonFactory = new JsonFactory();
  // Jackson must not close the stream itself; its lifecycle is owned by the try-with-resources below
  jsonFactory.configure(Feature.AUTO_CLOSE_TARGET, false);
  jsonFactory.configure(JsonParser.Feature.AUTO_CLOSE_SOURCE, false);

  ObjectMapper mapper = new ObjectMapper(jsonFactory);
  SimpleModule module = new SimpleModule();
  module.addSerializer(Path.class, new PathSerDe.Se());
  if (parquetMetadata instanceof Metadata_V4.FileMetadata) {
    module.addSerializer(ColumnMetadata_v4.class, new ColumnMetadata_v4.Serializer());
  }
  mapper.registerModule(module);

  try (OutputStream os = fs.create(p)) {
    mapper.writerWithDefaultPrettyPrinter().writeValue(os, parquetMetadata);
    os.flush();
  }
}
Aggregations