Search in sources :

Example 1 with TimedRunnable

use of org.apache.drill.exec.store.TimedRunnable in project drill by apache.

the class Metadata method getParquetFileMetadata_v3.

/**
 * Gathers the parquet file metadata for every file in the supplied list.
 *
 * <p>One {@code MetadataGatherer} task is created per file status; the tasks are then
 * executed via {@code TimedRunnable.run} under the activity label "Fetch parquet metadata"
 * with a parallelism argument of 16, and the results are copied into a new list.
 *
 * @param parquetTableMetadata_v3 table-level metadata object handed to each gatherer
 * @param fileStatuses statuses of the parquet files to process
 * @return a new list holding the metadata produced by the gatherers
 * @throws IOException declared by this method; presumably propagated from
 *         {@code TimedRunnable.run} -- confirm against that method
 */
private List<ParquetFileMetadata_v3> getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata_v3, List<FileStatus> fileStatuses) throws IOException {
    final List<TimedRunnable<ParquetFileMetadata_v3>> tasks = Lists.newArrayList();
    for (final FileStatus status : fileStatuses) {
        tasks.add(new MetadataGatherer(parquetTableMetadata_v3, status));
    }
    final List<ParquetFileMetadata_v3> gathered = TimedRunnable.run("Fetch parquet metadata", logger, tasks, 16);
    // Hand back a fresh list rather than the one returned by the runner.
    return Lists.newArrayList(gathered);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) TimedRunnable(org.apache.drill.exec.store.TimedRunnable)

Example 2 with TimedRunnable

use of org.apache.drill.exec.store.TimedRunnable in project drill by axbaretto.

the class FooterGatherer method getFooters.

/**
 * Collects parquet footers for the given file statuses.
 *
 * <p>Plain files are queued for a {@code FooterReader}. For a directory, the summary file
 * named by {@code ParquetFileWriter.PARQUET_METADATA_FILE} is consulted first: when it
 * exists its footers are read directly; otherwise every entry returned by
 * {@code DrillFileSystemUtil.listFiles} for that directory is queued for a
 * {@code FooterReader}. Queued readers are executed through {@code TimedRunnable.run}.
 *
 * @param conf configuration used to resolve file systems and read footers
 * @param statuses files and/or directories to inspect
 * @param parallelism parallelism value forwarded to {@code TimedRunnable.run}
 * @return footers taken from summary files followed by footers produced by the readers
 * @throws IOException if a file system operation or footer read fails
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
    final List<Footer> footers = Lists.newArrayList();
    final List<TimedRunnable<Footer>> pendingReads = Lists.newArrayList();

    for (FileStatus entry : statuses) {
        if (!entry.isDirectory()) {
            pendingReads.add(new FooterReader(conf, entry));
            continue;
        }

        final Path dir = entry.getPath();
        final FileSystem dirFs = dir.getFileSystem(conf);
        final Path summary = new Path(dir, ParquetFileWriter.PARQUET_METADATA_FILE);

        if (dirFs.exists(summary)) {
            // A summary file already carries the footers for this directory.
            footers.addAll(ParquetFileReader.readSummaryFile(conf, dirFs.getFileStatus(summary)));
            continue;
        }

        // No summary file: read each listed file individually.
        // NOTE(review): the 'false' flag is presumably "non-recursive" -- confirm in DrillFileSystemUtil.
        for (FileStatus child : DrillFileSystemUtil.listFiles(dirFs, dir, false)) {
            pendingReads.add(new FooterReader(conf, child));
        }
    }

    if (pendingReads.isEmpty()) {
        return footers;
    }
    footers.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, pendingReads, parallelism));
    return footers;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) Footer(org.apache.parquet.hadoop.Footer) TimedRunnable(org.apache.drill.exec.store.TimedRunnable)

Example 3 with TimedRunnable

use of org.apache.drill.exec.store.TimedRunnable in project drill by axbaretto.

the class BlockMapBuilder method generateFileWork.

/**
 * Builds the file work units for the given files.
 *
 * <p>One {@code BlockMapReader} task is created per file; the tasks are executed through
 * {@code TimedRunnable.run} under the activity label "Get block maps" with a parallelism
 * argument of 16, and the per-task result lists are concatenated into a single list.
 *
 * @param files statuses of the files to process
 * @param blockify flag passed unchanged to each {@code BlockMapReader}
 * @return a single list containing every work unit produced by the tasks, in task order
 * @throws IOException declared by this method; presumably propagated from
 *         {@code TimedRunnable.run} -- confirm against that method
 */
public List<CompleteFileWork> generateFileWork(List<FileStatus> files, boolean blockify) throws IOException {
    final List<TimedRunnable<List<CompleteFileWork>>> tasks = Lists.newArrayList();
    for (final FileStatus file : files) {
        tasks.add(new BlockMapReader(file, blockify));
    }

    final List<CompleteFileWork> flattened = Lists.newArrayList();
    // Flatten the list-of-lists returned by the runner.
    for (final List<CompleteFileWork> perFile : TimedRunnable.run("Get block maps", logger, tasks, 16)) {
        flattened.addAll(perFile);
    }
    return flattened;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) List(java.util.List) TimedRunnable(org.apache.drill.exec.store.TimedRunnable)

Example 4 with TimedRunnable

use of org.apache.drill.exec.store.TimedRunnable in project drill by axbaretto.

the class Metadata method getParquetFileMetadata_v3.

/**
 * Produces the file-level metadata for each of the given parquet files.
 *
 * <p>A {@code MetadataGatherer} is created per file status and all gatherers are executed
 * through {@code TimedRunnable.run} under the activity label "Fetch parquet metadata" with a
 * parallelism argument of 16.
 *
 * @param parquetTableMetadata_v3 can store column schema info from all the files and row groups
 * @param fileStatuses statuses of the parquet files to read
 *
 * @return list of the parquet file metadata with absolute paths
 * @throws IOException is thrown in case of issues while executing the list of runnables
 */
private List<ParquetFileMetadata_v3> getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata_v3, List<FileStatus> fileStatuses) throws IOException {
    final List<ParquetFileMetadata_v3> collected = Lists.newArrayList();
    final List<TimedRunnable<ParquetFileMetadata_v3>> jobs = Lists.newArrayList();

    for (final FileStatus parquetFile : fileStatuses) {
        final MetadataGatherer job = new MetadataGatherer(parquetTableMetadata_v3, parquetFile);
        jobs.add(job);
    }

    final List<ParquetFileMetadata_v3> fetched = TimedRunnable.run("Fetch parquet metadata", logger, jobs, 16);
    collected.addAll(fetched);
    return collected;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) TimedRunnable(org.apache.drill.exec.store.TimedRunnable)

Aggregations

TimedRunnable (org.apache.drill.exec.store.TimedRunnable): 4, FileStatus (org.apache.hadoop.fs.FileStatus): 4, List (java.util.List): 1, FileSystem (org.apache.hadoop.fs.FileSystem): 1, Path (org.apache.hadoop.fs.Path): 1, Footer (org.apache.parquet.hadoop.Footer): 1