Search in sources :

Example 1 with ParquetFileMetadata_v4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4 in project drill by apache.

In class Metadata, method createMetaFilesRecursively:

/**
 * Create the parquet metadata files for the directory at the given path and for any subdirectories.
 * Metadata cache files written to the disk contain relative paths. Returned Pair of metadata contains absolute paths.
 *
 * @param path to the directory of the parquet table
 * @param fs file system
 * @param allColumnsInteresting if set, store column metadata for all the columns
 * @param columnSet Set of columns for which column metadata has to be stored
 * @return Pair of parquet metadata. The left one is a parquet metadata for the table. The right one of the Pair is
 *         a metadata for all subdirectories (if they are present and there are no parquet files in the
 *         {@code path} directory).
 * @throws IOException if parquet metadata can't be serialized and written to the json file
 */
private Pair<ParquetTableMetadata_v4, ParquetTableMetadataDirs> createMetaFilesRecursively(Path path, FileSystem fs, boolean allColumnsInteresting, Set<SchemaPath> columnSet) throws IOException {
    // Only pay for the stopwatch when debug logging is enabled; checked again before logging below.
    Stopwatch timer = logger.isDebugEnabled() ? Stopwatch.createStarted() : null;
    // Accumulators for this directory level: file metadata, child directories, merged column type info.
    List<ParquetFileMetadata_v4> metaDataList = Lists.newArrayList();
    List<Path> directoryList = Lists.newArrayList();
    ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> columnTypeInfoSet = new ConcurrentHashMap<>();
    FileStatus fileStatus = fs.getFileStatus(path);
    // Running row count across all subdirectories and direct child files of this directory.
    long dirTotalRowCount = 0;
    assert fileStatus.isDirectory() : "Expected directory";
    // Direct (non-directory) children of this path; LinkedHashMap preserves listing order.
    final Map<FileStatus, FileSystem> childFiles = new LinkedHashMap<>();
    for (final FileStatus file : DrillFileSystemUtil.listAll(fs, path, false)) {
        if (file.isDirectory()) {
            // Recurse first, then fold the subdirectory's metadata into this level's accumulators.
            ParquetTableMetadata_v4 subTableMetadata = (createMetaFilesRecursively(file.getPath(), fs, allColumnsInteresting, columnSet)).getLeft();
            ConcurrentHashMap<ColumnTypeMetadata_v4.Key, ColumnTypeMetadata_v4> subTableColumnTypeInfo = subTableMetadata.getColumnTypeInfoMap();
            metaDataList.addAll((List<ParquetFileMetadata_v4>) subTableMetadata.getFiles());
            directoryList.addAll(subTableMetadata.getDirectories());
            directoryList.add(file.getPath());
            // TODO: We need a merge method that merges two columns with the same name but different types
            if (columnTypeInfoSet.isEmpty()) {
                // First subdirectory seen: take its column type info wholesale.
                columnTypeInfoSet.putAll(subTableColumnTypeInfo);
            } else {
                // Merge per-column null counts from the subdirectory into the running set.
                for (ColumnTypeMetadata_v4.Key key : subTableColumnTypeInfo.keySet()) {
                    ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(key);
                    if (columnTypeMetadata_v4 == null) {
                        // Column not seen before at this level: adopt the subdirectory's entry.
                        columnTypeMetadata_v4 = subTableColumnTypeInfo.get(key);
                    } else {
                        // If either side's null count is unknown (negative), mark the merged total
                        // as unknown
                        if (subTableColumnTypeInfo.get(key).totalNullCount < 0 || columnTypeMetadata_v4.totalNullCount < 0) {
                            columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
                        } else {
                            columnTypeMetadata_v4.totalNullCount = columnTypeMetadata_v4.totalNullCount + subTableColumnTypeInfo.get(key).totalNullCount;
                        }
                    }
                    columnTypeInfoSet.put(key, columnTypeMetadata_v4);
                }
            }
            dirTotalRowCount = dirTotalRowCount + subTableMetadata.getTotalRowCount();
        } else {
            // Non-directory child: collect it for batch metadata extraction below.
            childFiles.put(file, fs);
        }
    }
    Metadata_V4.MetadataSummary metadataSummary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion(), allColumnsInteresting || columnSet == null);
    ParquetTableMetadata_v4 parquetTableMetadata = new ParquetTableMetadata_v4(metadataSummary);
    if (childFiles.size() > 0) {
        List<ParquetFileAndRowCountMetadata> childFileAndRowCountMetadata = getParquetFileMetadata_v4(parquetTableMetadata, childFiles, allColumnsInteresting, columnSet);
        // If the columnTypeInfoSet is empty, add the columnTypeInfo from the parquetTableMetadata
        if (columnTypeInfoSet.isEmpty()) {
            columnTypeInfoSet.putAll(parquetTableMetadata.getColumnTypeInfoMap());
        }
        // Fold each child file's metadata, row count, and per-column null counts into the accumulators.
        for (ParquetFileAndRowCountMetadata parquetFileAndRowCountMetadata : childFileAndRowCountMetadata) {
            metaDataList.add(parquetFileAndRowCountMetadata.getFileMetadata());
            dirTotalRowCount = dirTotalRowCount + parquetFileAndRowCountMetadata.getFileRowCount();
            Map<ColumnTypeMetadata_v4.Key, Long> totalNullCountMap = parquetFileAndRowCountMetadata.getTotalNullCountMap();
            for (ColumnTypeMetadata_v4.Key column : totalNullCountMap.keySet()) {
                ColumnTypeMetadata_v4 columnTypeMetadata_v4 = columnTypeInfoSet.get(column);
                // If the column is not present in columnTypeInfoSet, get it from parquetTableMetadata
                if (columnTypeMetadata_v4 == null) {
                    columnTypeMetadata_v4 = parquetTableMetadata.getColumnTypeInfoMap().get(column);
                }
                // If either null count is unknown (negative), mark the merged total
                // as unknown
                if (columnTypeMetadata_v4.totalNullCount < 0 || totalNullCountMap.get(column) < 0) {
                    columnTypeMetadata_v4.totalNullCount = NULL_COUNT_NOT_EXISTS;
                } else {
                    columnTypeMetadata_v4.totalNullCount += totalNullCountMap.get(column);
                }
                columnTypeInfoSet.put(column, columnTypeMetadata_v4);
            }
        }
    }
    metadataSummary.directories = directoryList;
    parquetTableMetadata.assignFiles(metaDataList);
    // TODO: We need a merge method that merges two columns with the same name but different types
    if (metadataSummary.columnTypeInfo == null) {
        metadataSummary.columnTypeInfo = new ConcurrentHashMap<>();
    }
    metadataSummary.columnTypeInfo.putAll(columnTypeInfoSet);
    metadataSummary.allColumnsInteresting = allColumnsInteresting;
    metadataSummary.totalRowCount = dirTotalRowCount;
    parquetTableMetadata.metadataSummary = metadataSummary;
    // Remove any metadata cache files written by older Drill versions before writing new ones.
    for (String oldName : OLD_METADATA_FILENAMES) {
        fs.delete(new Path(path, oldName), false);
    }
    // relative paths in the metadata are only necessary for meta cache files.
    ParquetTableMetadata_v4 metadataTableWithRelativePaths = MetadataPathUtils.createMetadataWithRelativePaths(parquetTableMetadata, path);
    writeFile(metadataTableWithRelativePaths.fileMetadata, new Path(path, METADATA_FILENAME), fs);
    writeFile(metadataTableWithRelativePaths.getSummary(), new Path(path, METADATA_SUMMARY_FILENAME), fs);
    Metadata_V4.MetadataSummary metadataSummaryWithRelativePaths = metadataTableWithRelativePaths.getSummary();
    // Directories list will be empty at the leaf level directories. For sub-directories with both files and directories,
    // only the directories will be included in the list.
    writeFile(new ParquetTableMetadataDirs(metadataSummaryWithRelativePaths.directories), new Path(path, METADATA_DIRECTORIES_FILENAME), fs);
    if (timer != null) {
        logger.debug("Creating metadata files recursively took {} ms", timer.elapsed(TimeUnit.MILLISECONDS));
        timer.stop();
    }
    // Returned metadata keeps absolute paths; only the on-disk cache files use relative paths.
    return Pair.of(parquetTableMetadata, new ParquetTableMetadataDirs(directoryList));
}
Also used : ColumnTypeMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4) FileStatus(org.apache.hadoop.fs.FileStatus) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) LinkedHashMap(java.util.LinkedHashMap) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) FileSystem(org.apache.hadoop.fs.FileSystem) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) ParquetFileAndRowCountMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Example 2 with ParquetFileMetadata_v4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4 in project drill by apache.

In class Metadata, method getParquetTableMetadata:

/**
 * Get the parquet metadata for a list of parquet files.
 * All columns are treated as interesting since no explicit column set is supplied.
 *
 * @param fileStatusMap file statuses and corresponding file systems
 * @return parquet table metadata object
 * @throws IOException if parquet file metadata can't be obtained
 */
private ParquetTableMetadata_v4 getParquetTableMetadata(Map<FileStatus, FileSystem> fileStatusMap) throws IOException {
    Metadata_V4.MetadataSummary summary = new Metadata_V4.MetadataSummary(SUPPORTED_VERSIONS.last().toString(), DrillVersionInfo.getVersion(), new ArrayList<>(), true);
    ParquetTableMetadata_v4 tableMetadata = new ParquetTableMetadata_v4(summary);
    // allColumnsInteresting=true, columnSet=null: collect metadata for every column.
    List<ParquetFileAndRowCountMetadata> fileAndRowCountMetadata = getParquetFileMetadata_v4(tableMetadata, fileStatusMap, true, null);
    // Strip the row-count wrapper, keeping only the per-file metadata.
    List<ParquetFileMetadata_v4> fileMetadataList = new ArrayList<>(fileAndRowCountMetadata.size());
    fileAndRowCountMetadata.forEach(meta -> fileMetadataList.add(meta.getFileMetadata()));
    tableMetadata.assignFiles(fileMetadataList);
    return tableMetadata;
}
Also used : ParquetFileAndRowCountMetadata(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ArrayList(java.util.ArrayList) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) MetadataSummary(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Example 3 with ParquetFileMetadata_v4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4 in project drill by apache.

In class MetadataPathUtils, method convertToFilesWithAbsolutePaths:

/**
 * Convert a list of files with relative paths to files with absolute ones.
 * Files whose path is already absolute are reused as-is; otherwise a new
 * metadata object is created with the path resolved against {@code baseDir}.
 *
 * @param files list of files with relative paths
 * @param baseDir base parent directory
 * @return list of files with absolute paths
 */
public static List<? extends ParquetFileMetadata> convertToFilesWithAbsolutePaths(List<? extends ParquetFileMetadata> files, String baseDir) {
    if (files.isEmpty()) {
        // Nothing to convert; return the original (empty) list.
        return files;
    }
    List<ParquetFileMetadata> resolvedFiles = new ArrayList<>();
    for (ParquetFileMetadata file : files) {
        Path filePath = file.getPath();
        ParquetFileMetadata resolved = null;
        // Create a new file if the old one contains a relative path, otherwise reuse the old file.
        if (file instanceof ParquetFileMetadata_v4) {
            resolved = filePath.isAbsolute() ? file : new ParquetFileMetadata_v4(new Path(baseDir, filePath), file.getLength(), (List<Metadata_V4.RowGroupMetadata_v4>) file.getRowGroups());
        } else if (file instanceof ParquetFileMetadata_v3) {
            resolved = filePath.isAbsolute() ? file : new ParquetFileMetadata_v3(new Path(baseDir, filePath), file.getLength(), (List<Metadata_V3.RowGroupMetadata_v3>) file.getRowGroups());
        }
        // NOTE(review): any metadata version other than v3/v4 produces a null entry here —
        // confirm whether callers tolerate nulls or whether this should fail fast.
        resolvedFiles.add(resolved);
    }
    return resolvedFiles;
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetFileMetadata(org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) ParquetFileMetadata_v3(org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetFileMetadata_v3) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Example 4 with ParquetFileMetadata_v4

use of org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4 in project drill by apache.

In class MetadataPathUtils, method createMetadataWithRelativePaths:

/**
 * Creates a new parquet table metadata from the {@code tableMetadataWithAbsolutePaths} parquet table.
 * The new parquet table metadata contains relative paths for both files and directories.
 *
 * @param tableMetadataWithAbsolutePaths parquet table metadata with absolute paths for the files and directories
 * @param baseDir base parent directory
 * @return parquet table metadata with relative paths for the files and directories
 */
public static ParquetTableMetadata_v4 createMetadataWithRelativePaths(ParquetTableMetadata_v4 tableMetadataWithAbsolutePaths, Path baseDir) {
    // Relativize every directory path against the base directory.
    List<Path> relativeDirs = new ArrayList<>();
    tableMetadataWithAbsolutePaths.getDirectories().forEach(dir -> relativeDirs.add(relativize(baseDir, dir)));
    // Rebuild each file's metadata with its path relativized, keeping length and row groups.
    List<ParquetFileMetadata_v4> relativeFiles = new ArrayList<>();
    for (ParquetFileMetadata_v4 file : (List<ParquetFileMetadata_v4>) tableMetadataWithAbsolutePaths.getFiles()) {
        relativeFiles.add(new ParquetFileMetadata_v4(relativize(baseDir, file.getPath()), file.length, file.rowGroups));
    }
    return new ParquetTableMetadata_v4(
        SUPPORTED_VERSIONS.last().toString(),
        tableMetadataWithAbsolutePaths,
        relativeFiles,
        relativeDirs,
        DrillVersionInfo.getVersion(),
        tableMetadataWithAbsolutePaths.getTotalRowCount(),
        tableMetadataWithAbsolutePaths.isAllColumnsInteresting());
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) ParquetTableMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4) ParquetFileMetadata_v4(org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)

Aggregations

ParquetFileMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileMetadata_v4)4 ArrayList (java.util.ArrayList)3 ParquetTableMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetTableMetadata_v4)3 Path (org.apache.hadoop.fs.Path)3 List (java.util.List)2 MetadataSummary (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.MetadataSummary)2 ParquetFileAndRowCountMetadata (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ParquetFileAndRowCountMetadata)2 LinkedHashMap (java.util.LinkedHashMap)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 SchemaPath (org.apache.drill.common.expression.SchemaPath)1 ParquetFileMetadata (org.apache.drill.exec.store.parquet.metadata.MetadataBase.ParquetFileMetadata)1 ParquetFileMetadata_v3 (org.apache.drill.exec.store.parquet.metadata.Metadata_V3.ParquetFileMetadata_v3)1 ColumnTypeMetadata_v4 (org.apache.drill.exec.store.parquet.metadata.Metadata_V4.ColumnTypeMetadata_v4)1 Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1