Example 11 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in class TestParquetFilterPushDown, method testIntPredicateAgainstAllNullColWithEval.

@Test
public void testIntPredicateAgainstAllNullColWithEval() throws Exception {
    // intAllNull.parquet has only one int column with all values being NULL.
    // column value statistics: num_nulls = 25, min/max not defined
    final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "intTbl", "intAllNull.parquet")).toFile();
    ParquetMetadata footer = getParquetMetaData(file);
    testParquetRowGroupFilterEval(footer, "intCol = 100", true);
    testParquetRowGroupFilterEval(footer, "intCol = 0", true);
    testParquetRowGroupFilterEval(footer, "intCol = -100", true);
    testParquetRowGroupFilterEval(footer, "intCol > 10", true);
    testParquetRowGroupFilterEval(footer, "intCol >= 10", true);
    testParquetRowGroupFilterEval(footer, "intCol < 10", true);
    testParquetRowGroupFilterEval(footer, "intCol <= 10", true);
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
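
The expectations above follow from the footer statistics alone: with num_nulls equal to the row count and no min/max, every comparison on intCol is unknown for all rows, so each predicate reports that the row group can be dropped. As a companion illustration, here is a minimal, self-contained sketch (not part of the test) that reads a footer with parquet-mr and prints those statistics; the local file name is a placeholder, and readFooter is deprecated in recent parquet-mr releases but matches the era of this code.

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class InspectNullStats {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point this at any Parquet file.
        Path path = new Path(new File("intAllNull.parquet").toURI());
        ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), path);
        for (BlockMetaData block : footer.getBlocks()) {
            for (ColumnChunkMetaData column : block.getColumns()) {
                // For an all-null column, getNumNulls() equals the row count and
                // hasNonNullValue() is false, so any comparison predicate can prune the group.
                System.out.printf("%s: nulls=%d, hasNonNull=%b%n",
                    column.getPath(),
                    column.getStatistics().getNumNulls(),
                    column.getStatistics().hasNonNullValue());
            }
        }
    }
}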

Example 12 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in class TestParquetFilterPushDown, method testTimeStampPredicateWithEval.

@Test
public void testTimeStampPredicateWithEval() throws Exception {
    // Table tsTbl/t1 is created by CTAS in Drill 1.8.0:
    // create table dfs.tmp.`tsTbl/t1` as select DATE_ADD(cast(o_orderdate as date), INTERVAL '0 10:20:30' DAY TO SECOND) as o_ordertimestamp from cp.`tpch/orders.parquet` where o_orderdate between date '1992-01-01' and date '1992-01-03';
    final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "tsTbl", "t1", "0_0_0.parquet")).toFile();
    ParquetMetadata footer = getParquetMetaData(file);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:30' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:29' as timestamp)", true);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-01 10:20:29' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:30' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:31' as timestamp)", true);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:29' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:30' as timestamp)", true);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:30' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:29' as timestamp)", true);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:31' as timestamp)", false);
    testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:30' as timestamp)", true);
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
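
Given the CTAS above, the row group's o_ordertimestamp statistics should span 1992-01-01 10:20:30 (min) to 1992-01-03 10:20:30 (max), which is exactly the boundary behavior the assertions probe. A minimal sketch of the min/max pruning rule being exercised (not Drill's implementation; the min/max values here are derived from the CTAS, not read from the file):

import java.time.LocalDateTime;

public class TimestampPruneSketch {
    static final LocalDateTime MIN = LocalDateTime.parse("1992-01-01T10:20:30");
    static final LocalDateTime MAX = LocalDateTime.parse("1992-01-03T10:20:30");

    // "col = v" can drop the row group when v lies outside [min, max].
    static boolean canDropEquals(LocalDateTime v) {
        return v.isBefore(MIN) || v.isAfter(MAX);
    }

    // "col > v" can drop the row group when max <= v.
    static boolean canDropGreaterThan(LocalDateTime v) {
        return !MAX.isAfter(v);
    }

    public static void main(String[] args) {
        // Mirrors two of the assertions above: both print true (droppable).
        System.out.println(canDropEquals(LocalDateTime.parse("1992-01-01T10:20:29")));
        System.out.println(canDropGreaterThan(LocalDateTime.parse("1992-01-03T10:20:30")));
    }
}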

Example 13 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in class TestParquetFilterPushDown, method testDatePredicateAgainstDrillCTAS1_8WithEval.

@Test
public void testDatePredicateAgainstDrillCTAS1_8WithEval() throws Exception {
    // The parquet file is created on Drill 1.8.0 with this CTAS:
    // create table dfs.tmp.`dateTblCorrupted/t1` as select cast(o_orderdate as date) as o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date '1992-01-01' and date '1992-01-03';
    final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "dateTblCorrupted", "t1", "0_0_0.parquet")).toFile();
    ParquetMetadata footer = getParquetMetaData(file);
    testDatePredicateAgainstDrillCTASHelper(footer);
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
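
The dateTblCorrupted name refers to the corrupted-date issue (DRILL-4203): Drill versions before 1.9 wrote DATE values shifted by twice the Julian day number of the Unix epoch, which is why date predicates against such files need special handling. A minimal sketch of the correction, assuming the shift constant from Drill's ParquetReaderUtility; the detection threshold below is a simplification for illustration, not Drill's exact check:

public class CorruptDateSketch {

    // 2440588 is the Julian day number of 1970-01-01; affected writers applied
    // the offset twice, so corrupted values are larger by 2 * 2440588 days.
    static final int CORRUPT_DATE_SHIFT = 2 * 2440588;

    // Simplified detection (an assumption, not Drill's exact threshold): no
    // legitimate days-since-epoch date comes anywhere near the corrupted range.
    static int correctIfCorrupt(int daysSinceEpoch) {
        return daysSinceEpoch >= CORRUPT_DATE_SHIFT / 2
                ? daysSinceEpoch - CORRUPT_DATE_SHIFT
                : daysSinceEpoch;
    }

    public static void main(String[] args) {
        int corrupted = 8035 + CORRUPT_DATE_SHIFT; // 8035 days since epoch is 1992-01-01
        System.out.println(correctIfCorrupt(corrupted)); // prints 8035
        System.out.println(correctIfCorrupt(8035));      // unaffected value passes through
    }
}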

Example 14 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in class TestParquetFilterPushDown, method testIntPredicateWithEval.

@Test
// Test filter evaluation directly without going through SQL queries.
public void testIntPredicateWithEval() throws Exception {
    // intTbl.parquet has only one int column
    // intCol : [0, 100].
    final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "intTbl", "intTbl.parquet")).toFile();
    ParquetMetadata footer = getParquetMetaData(file);
    testParquetRowGroupFilterEval(footer, "intCol = 100", false);
    testParquetRowGroupFilterEval(footer, "intCol = 0", false);
    testParquetRowGroupFilterEval(footer, "intCol = 50", false);
    testParquetRowGroupFilterEval(footer, "intCol = -1", true);
    testParquetRowGroupFilterEval(footer, "intCol = 101", true);
    testParquetRowGroupFilterEval(footer, "intCol > 100", true);
    testParquetRowGroupFilterEval(footer, "intCol > 99", false);
    testParquetRowGroupFilterEval(footer, "intCol >= 100", false);
    testParquetRowGroupFilterEval(footer, "intCol >= 101", true);
    testParquetRowGroupFilterEval(footer, "intCol < 100", false);
    testParquetRowGroupFilterEval(footer, "intCol < 1", false);
    testParquetRowGroupFilterEval(footer, "intCol < 0", true);
    testParquetRowGroupFilterEval(footer, "intCol <= 100", false);
    testParquetRowGroupFilterEval(footer, "intCol <= 1", false);
    testParquetRowGroupFilterEval(footer, "intCol <= 0", false);
    testParquetRowGroupFilterEval(footer, "intCol <= -1", true);
    // "and"
    testParquetRowGroupFilterEval(footer, "intCol > 100 and intCol  < 200", true);
    testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol < 200", false);
    // essentially, intCol > 200
    testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol > 200", true);
    // "or"
    testParquetRowGroupFilterEval(footer, "intCol = 150 or intCol = 160", true);
    testParquetRowGroupFilterEval(footer, "intCol = 50 or intCol = 160", false);
    // "nonExistCol" does not exist in the table. "AND" with a filter on exist column
    testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol = 100", true);
    // since nonExistCol = 100 -> Unknown -> could drop.
    testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol = 100", true);
    // since nonExistCol = 100 -> Unknown -> could drop.
    testParquetRowGroupFilterEval(footer, "nonExistCol = 100 and intCol > 50", true);
    testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol < 'abc'", true);
    // nonExistCol < 'abc' hit NumberException and is ignored, but intCol >100 will say "drop".
    testParquetRowGroupFilterEval(footer, "nonExistCol < 'abc' and intCol > 100", true);
    // because nonExistCol < 'abc' hit NumberException and is ignored.
    testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol < 'abc'", false);
    // "nonExistCol" does not exist in the table. "OR" with a filter on exist column
    // nonExistCol = 100 -> could drop.
    testParquetRowGroupFilterEval(footer, "intCol > 100 or nonExistCol = 100", true);
    // nonExistCol = 100 -> could drop.
    testParquetRowGroupFilterEval(footer, "nonExistCol = 100 or intCol > 100", true);
    testParquetRowGroupFilterEval(footer, "intCol > 50 or nonExistCol < 100", false);
    testParquetRowGroupFilterEval(footer, "nonExistCol < 100 or intCol > 50", false);
    // cast function on column side (LHS)
    testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 100", false);
    testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 0", false);
    testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 50", false);
    testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 101", true);
    testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = -1", true);
    // cast function on constant side (RHS)
    testParquetRowGroupFilterEval(footer, "intCol = cast(100 as bigint)", false);
    testParquetRowGroupFilterEval(footer, "intCol = cast(0 as bigint)", false);
    testParquetRowGroupFilterEval(footer, "intCol = cast(50 as bigint)", false);
    testParquetRowGroupFilterEval(footer, "intCol = cast(101 as bigint)", true);
    testParquetRowGroupFilterEval(footer, "intCol = cast(-1 as bigint)", true);
    // cast into float4/float8
    testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(101.0 as float4)", true);
    testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(-1.0 as float4)", true);
    testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(1.0 as float4)", false);
    testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 101.0", true);
    testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = -1.0", true);
    testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 1.0", false);
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
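
The AND/OR cases encode how per-operand "can drop" verdicts combine: a conjunction can be dropped if either side alone can never match, while a disjunction can be dropped only if both sides can; a comparison against a missing column behaves like one against an all-null column and is droppable on its own. A minimal sketch of that combination rule (not Drill's code):

public class PruneCombineSketch {

    // A conjunct that can never match lets us skip the whole row group.
    static boolean canDropAnd(boolean leftCanDrop, boolean rightCanDrop) {
        return leftCanDrop || rightCanDrop;
    }

    // A disjunction can be skipped only when neither side can ever match.
    static boolean canDropOr(boolean leftCanDrop, boolean rightCanDrop) {
        return leftCanDrop && rightCanDrop;
    }

    public static void main(String[] args) {
        // With intCol in [0, 100]: "intCol > 50" is not droppable, "nonExistCol = 100" is.
        System.out.println(canDropAnd(false, true)); // true, matches the test above
        System.out.println(canDropOr(false, true));  // false, matches the test above
    }
}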

Example 15 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache, in class ParquetFileReader, method readAllFootersInParallelUsingSummaryFiles.

/**
 * For each file provided, checks whether a summary file exists in its directory.
 * If a summary file is found it is used; otherwise the file footer is read.
 * @param configuration the hadoop conf to connect to the file system
 * @param partFiles the part files to read
 * @param skipRowGroups whether to skip row groups when reading the footers
 * @return the footers for those files, using the summary file if possible
 * @throws IOException if the footers cannot be read
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static List<Footer> readAllFootersInParallelUsingSummaryFiles(final Configuration configuration, final Collection<FileStatus> partFiles, final boolean skipRowGroups) throws IOException {
    // figure out list of all parents to part files
    Set<Path> parents = new HashSet<Path>();
    for (FileStatus part : partFiles) {
        parents.add(part.getPath().getParent());
    }
    // read corresponding summary files if they exist
    List<Callable<Map<Path, Footer>>> summaries = new ArrayList<Callable<Map<Path, Footer>>>();
    for (final Path path : parents) {
        summaries.add(new Callable<Map<Path, Footer>>() {

            @Override
            public Map<Path, Footer> call() throws Exception {
                ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups);
                if (mergedMetadata != null) {
                    final List<Footer> footers;
                    if (skipRowGroups) {
                        footers = new ArrayList<Footer>();
                        for (FileStatus f : partFiles) {
                            footers.add(new Footer(f.getPath(), mergedMetadata));
                        }
                    } else {
                        footers = footersFromSummaryFile(path, mergedMetadata);
                    }
                    Map<Path, Footer> map = new HashMap<Path, Footer>();
                    for (Footer footer : footers) {
                        // the folder may have been moved
                        footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata());
                        map.put(footer.getFile(), footer);
                    }
                    return map;
                } else {
                    return Collections.emptyMap();
                }
            }
        });
    }
    Map<Path, Footer> cache = new HashMap<Path, Footer>();
    try {
        List<Map<Path, Footer>> footersFromSummaries = runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
        for (Map<Path, Footer> footers : footersFromSummaries) {
            cache.putAll(footers);
        }
    } catch (ExecutionException e) {
        throw new IOException("Error reading summaries", e);
    }
    // keep only footers for files actually requested and read file footer if not found in summaries
    List<Footer> result = new ArrayList<Footer>(partFiles.size());
    List<FileStatus> toRead = new ArrayList<FileStatus>();
    for (FileStatus part : partFiles) {
        Footer f = cache.get(part.getPath());
        if (f != null) {
            result.add(f);
        } else {
            toRead.add(part);
        }
    }
    if (toRead.size() > 0) {
        // read the footers of the files that did not have a summary file
        LOG.info("reading another {} footers", toRead.size());
        result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
    }
    return result;
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) FileStatus(org.apache.hadoop.fs.FileStatus) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Callable(java.util.concurrent.Callable) ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) ExecutionException(java.util.concurrent.ExecutionException) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet)
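
For reference, a minimal usage sketch of this (deprecated) entry point: list the part files of a directory and let the reader prefer a _metadata summary file, falling back to parallel footer reads. The directory path is a placeholder.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class ReadFootersExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder directory; any local or HDFS path holding part files works.
        Path dir = new Path("file:///tmp/parquet-table");
        FileSystem fs = dir.getFileSystem(conf);
        List<FileStatus> parts = new ArrayList<>();
        for (FileStatus st : fs.listStatus(dir)) {
            // Skip _metadata/_common_metadata and hidden files; keep only part files.
            String name = st.getPath().getName();
            if (!name.startsWith("_") && !name.startsWith(".")) {
                parts.add(st);
            }
        }
        // skipRowGroups = true reads only file-level metadata from each footer.
        List<Footer> footers =
            ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, parts, true);
        for (Footer footer : footers) {
            System.out.println(footer.getFile() + " -> "
                + footer.getParquetMetadata().getFileMetaData().getSchema());
        }
    }
}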

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76 usages
Path (org.apache.hadoop.fs.Path): 39 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 21 usages
MessageType (org.apache.parquet.schema.MessageType): 21 usages
ArrayList (java.util.ArrayList): 19 usages
IOException (java.io.IOException): 18 usages
Test (org.junit.Test): 17 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 16 usages
Map (java.util.Map): 11 usages
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11 usages
File (java.io.File): 10 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 10 usages
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9 usages
HashMap (java.util.HashMap): 8 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7 usages
List (java.util.List): 6 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6 usages