Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
The class TestParquetFilterPushDown, method testIntPredicateAgainstAllNullColWithEval.
@Test
public void testIntPredicateAgainstAllNullColWithEval() throws Exception {
  // intAllNull.parquet has only one int column, with all values being NULL.
  // Column value statistics: num_nulls: 25, min/max is not defined.
  final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "intTbl", "intAllNull.parquet")).toFile();
  ParquetMetadata footer = getParquetMetaData(file);
  testParquetRowGroupFilterEval(footer, "intCol = 100", true);
  testParquetRowGroupFilterEval(footer, "intCol = 0", true);
  testParquetRowGroupFilterEval(footer, "intCol = -100", true);
  testParquetRowGroupFilterEval(footer, "intCol > 10", true);
  testParquetRowGroupFilterEval(footer, "intCol >= 10", true);
  testParquetRowGroupFilterEval(footer, "intCol < 10", true);
  testParquetRowGroupFilterEval(footer, "intCol <= 10", true);
}
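All of the expectations above hold for the same reason: the column statistics report num_nulls equal to the row count, so min/max are undefined and no comparison predicate can match. A minimal sketch of that check against the footer, assuming a single row group with a single column; the helper name allNullsIn is hypothetical, not part of the test class:

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Hypothetical helper: true when every value of the first column in the
// first row group is NULL, so any comparison predicate can drop the group.
static boolean allNullsIn(ParquetMetadata footer) {
  BlockMetaData rowGroup = footer.getBlocks().get(0);        // single row group assumed
  ColumnChunkMetaData column = rowGroup.getColumns().get(0); // the lone int column
  Statistics stats = column.getStatistics();
  // num_nulls == row count means min/max are undefined and nothing can match
  return stats != null && stats.getNumNulls() == rowGroup.getRowCount();
}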
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
The class TestParquetFilterPushDown, method testTimeStampPredicateWithEval.
@Test
public void testTimeStampPredicateWithEval() throws Exception {
  // Table tsTbl/t1 is created by CTAS in drill 1.8.0:
  // create table dfs.tmp.`tsTbl/t1` as select DATE_ADD(cast(o_orderdate as date), INTERVAL '0 10:20:30' DAY TO SECOND) as o_ordertimestamp from cp.`tpch/orders.parquet` where o_orderdate between date '1992-01-01' and date '1992-01-03';
  final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "tsTbl", "t1", "0_0_0.parquet")).toFile();
  ParquetMetadata footer = getParquetMetaData(file);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:30' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp = cast('1992-01-01 10:20:29' as timestamp)", true);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-01 10:20:29' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:30' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp >= cast('1992-01-03 10:20:31' as timestamp)", true);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:29' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp > cast('1992-01-03 10:20:30' as timestamp)", true);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:30' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp <= cast('1992-01-01 10:20:29' as timestamp)", true);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:31' as timestamp)", false);
  testParquetRowGroupFilterEval(footer, "o_ordertimestamp < cast('1992-01-01 10:20:30' as timestamp)", true);
}
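Reading the expectations back against the data: the row group spans o_ordertimestamp values from 1992-01-01 10:20:30 through 1992-01-03 10:20:30, and a range predicate drops the group exactly when its constant falls outside that span. A hedged sketch of the two boundary rules, with minValue/maxValue as illustrative stand-ins for the row group's statistics (Drill holds TIMESTAMP values as epoch millis):

// Drop decision for "col >= constant": safe to drop only when even the
// largest stored value is below the constant.
static boolean canDropGreaterEqual(long maxValue, long constant) {
  return maxValue < constant;  // e.g. max 1992-01-03 10:20:30 < 10:20:31 -> drop
}

// Drop decision for "col < constant": safe to drop only when even the
// smallest stored value is not below the constant.
static boolean canDropLessThan(long minValue, long constant) {
  return minValue >= constant;  // e.g. min 1992-01-01 10:20:30 >= 10:20:30 -> drop
}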
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
The class TestParquetFilterPushDown, method testDatePredicateAgainstDrillCTAS1_8WithEval.
@Test
public void testDatePredicateAgainstDrillCTAS1_8WithEval() throws Exception {
  // The parquet file is created on drill 1.8.0 with DRILL CTAS:
  // create table dfs.tmp.`dateTblCorrupted/t1` as select cast(o_orderdate as date) as o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date '1992-01-01' and date '1992-01-03';
  final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "dateTblCorrupted", "t1", "0_0_0.parquet")).toFile();
  ParquetMetadata footer = getParquetMetaData(file);
  testDatePredicateAgainstDrillCTASHelper(footer);
}
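getParquetMetaData is the test class's own helper; a minimal sketch of such a footer read using the stock parquet-mr API follows. The exact Drill helper may differ, and readFooter is deprecated in newer parquet-mr releases:

import java.io.File;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Read only the footer (metadata) of a local parquet file, skipping row data.
static ParquetMetadata readFooter(File file) throws IOException {
  Configuration conf = new Configuration();
  return ParquetFileReader.readFooter(conf, new Path(file.toURI()), ParquetMetadataConverter.NO_FILTER);
}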
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto.
The class TestParquetFilterPushDown, method testIntPredicateWithEval.
@Test
// Test filter evaluation directly, without going through SQL queries.
public void testIntPredicateWithEval() throws Exception {
  // intTbl.parquet has only one int column:
  // intCol : [0, 100].
  final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "intTbl", "intTbl.parquet")).toFile();
  ParquetMetadata footer = getParquetMetaData(file);
  testParquetRowGroupFilterEval(footer, "intCol = 100", false);
  testParquetRowGroupFilterEval(footer, "intCol = 0", false);
  testParquetRowGroupFilterEval(footer, "intCol = 50", false);
  testParquetRowGroupFilterEval(footer, "intCol = -1", true);
  testParquetRowGroupFilterEval(footer, "intCol = 101", true);
  testParquetRowGroupFilterEval(footer, "intCol > 100", true);
  testParquetRowGroupFilterEval(footer, "intCol > 99", false);
  testParquetRowGroupFilterEval(footer, "intCol >= 100", false);
  testParquetRowGroupFilterEval(footer, "intCol >= 101", true);
  testParquetRowGroupFilterEval(footer, "intCol < 100", false);
  testParquetRowGroupFilterEval(footer, "intCol < 1", false);
  testParquetRowGroupFilterEval(footer, "intCol < 0", true);
  testParquetRowGroupFilterEval(footer, "intCol <= 100", false);
  testParquetRowGroupFilterEval(footer, "intCol <= 1", false);
  testParquetRowGroupFilterEval(footer, "intCol <= 0", false);
  testParquetRowGroupFilterEval(footer, "intCol <= -1", true);
  // "and"
  testParquetRowGroupFilterEval(footer, "intCol > 100 and intCol < 200", true);
  testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol < 200", false);
  // essentially, intCol > 200
  testParquetRowGroupFilterEval(footer, "intCol > 50 and intCol > 200", true);
  // "or"
  testParquetRowGroupFilterEval(footer, "intCol = 150 or intCol = 160", true);
  testParquetRowGroupFilterEval(footer, "intCol = 50 or intCol = 160", false);
  // "nonExistCol" does not exist in the table: "AND" with a filter on an existing column
  testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol = 100", true);
  // since nonExistCol = 100 -> Unknown -> could drop
  testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol = 100", true);
  // since nonExistCol = 100 -> Unknown -> could drop
  testParquetRowGroupFilterEval(footer, "nonExistCol = 100 and intCol > 50", true);
  testParquetRowGroupFilterEval(footer, "intCol > 100 and nonExistCol < 'abc'", true);
  // nonExistCol < 'abc' hits a NumberFormatException and is ignored, but intCol > 100 will say "drop"
  testParquetRowGroupFilterEval(footer, "nonExistCol < 'abc' and intCol > 100", true);
  // because nonExistCol < 'abc' hits a NumberFormatException and is ignored
  testParquetRowGroupFilterEval(footer, "intCol > 50 and nonExistCol < 'abc'", false);
  // "nonExistCol" does not exist in the table: "OR" with a filter on an existing column
  // nonExistCol = 100 -> could drop
  testParquetRowGroupFilterEval(footer, "intCol > 100 or nonExistCol = 100", true);
  // nonExistCol = 100 -> could drop
  testParquetRowGroupFilterEval(footer, "nonExistCol = 100 or intCol > 100", true);
  testParquetRowGroupFilterEval(footer, "intCol > 50 or nonExistCol < 100", false);
  testParquetRowGroupFilterEval(footer, "nonExistCol < 100 or intCol > 50", false);
  // cast function on the column side (LHS)
  testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 100", false);
  testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 0", false);
  testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 50", false);
  testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = 101", true);
  testParquetRowGroupFilterEval(footer, "cast(intCol as bigint) = -1", true);
  // cast function on the constant side (RHS)
  testParquetRowGroupFilterEval(footer, "intCol = cast(100 as bigint)", false);
  testParquetRowGroupFilterEval(footer, "intCol = cast(0 as bigint)", false);
  testParquetRowGroupFilterEval(footer, "intCol = cast(50 as bigint)", false);
  testParquetRowGroupFilterEval(footer, "intCol = cast(101 as bigint)", true);
  testParquetRowGroupFilterEval(footer, "intCol = cast(-1 as bigint)", true);
  // cast into float4/float8
  testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(101.0 as float4)", true);
  testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(-1.0 as float4)", true);
  testParquetRowGroupFilterEval(footer, "cast(intCol as float4) = cast(1.0 as float4)", false);
  testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 101.0", true);
  testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = -1.0", true);
  testParquetRowGroupFilterEval(footer, "cast(intCol as float8) = 1.0", false);
}
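The boolean combinations above follow one rule: a conjunction can be dropped when either side alone can be dropped, a disjunction only when both sides can, and a predicate on a non-existent column evaluates to all-NULL (Unknown), which itself qualifies as droppable. A compact sketch of that combination logic, with canDropLeft/canDropRight as illustrative stand-ins for the per-predicate results:

// Combining per-predicate drop decisions for the row-group filter.
static boolean canDropAnd(boolean canDropLeft, boolean canDropRight) {
  return canDropLeft || canDropRight;  // e.g. intCol > 50 AND intCol > 200 -> drop
}

static boolean canDropOr(boolean canDropLeft, boolean canDropRight) {
  return canDropLeft && canDropRight;  // e.g. intCol = 150 OR intCol = 160 -> drop
}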
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class ParquetFileReader, method readAllFootersInParallelUsingSummaryFiles.
/**
 * For each of the given files, check whether a summary file exists in the parent directory.
 * If a summary file is found it is used; otherwise the file footer is read.
 * @param configuration the hadoop conf to connect to the file system
 * @param partFiles the part files to read
 * @param skipRowGroups whether to skip row-group information in the footers
 * @return the footers for those files, using the summary file if possible
 * @throws IOException if the footers cannot be read
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static List<Footer> readAllFootersInParallelUsingSummaryFiles(final Configuration configuration, final Collection<FileStatus> partFiles, final boolean skipRowGroups) throws IOException {
  // figure out list of all parents to part files
  Set<Path> parents = new HashSet<Path>();
  for (FileStatus part : partFiles) {
    parents.add(part.getPath().getParent());
  }
  // read corresponding summary files if they exist
  List<Callable<Map<Path, Footer>>> summaries = new ArrayList<Callable<Map<Path, Footer>>>();
  for (final Path path : parents) {
    summaries.add(new Callable<Map<Path, Footer>>() {
      @Override
      public Map<Path, Footer> call() throws Exception {
        ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups);
        if (mergedMetadata != null) {
          final List<Footer> footers;
          if (skipRowGroups) {
            footers = new ArrayList<Footer>();
            for (FileStatus f : partFiles) {
              footers.add(new Footer(f.getPath(), mergedMetadata));
            }
          } else {
            footers = footersFromSummaryFile(path, mergedMetadata);
          }
          Map<Path, Footer> map = new HashMap<Path, Footer>();
          for (Footer footer : footers) {
            // the folder may have been moved
            footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata());
            map.put(footer.getFile(), footer);
          }
          return map;
        } else {
          return Collections.emptyMap();
        }
      }
    });
  }
  Map<Path, Footer> cache = new HashMap<Path, Footer>();
  try {
    List<Map<Path, Footer>> footersFromSummaries = runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
    for (Map<Path, Footer> footers : footersFromSummaries) {
      cache.putAll(footers);
    }
  } catch (ExecutionException e) {
    throw new IOException("Error reading summaries", e);
  }
  // keep only footers for files actually requested and read the file footer if not found in summaries
  List<Footer> result = new ArrayList<Footer>(partFiles.size());
  List<FileStatus> toRead = new ArrayList<FileStatus>();
  for (FileStatus part : partFiles) {
    Footer f = cache.get(part.getPath());
    if (f != null) {
      result.add(f);
    } else {
      toRead.add(part);
    }
  }
  if (toRead.size() > 0) {
    // read the footers of the files that did not have a summary file
    LOG.info("reading another {} footers", toRead.size());
    result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
  }
  return result;
}
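A hedged usage sketch for the method above, assuming a directory of part files on HDFS; the path and the underscore-prefix filter are illustrative, not part of the API:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class ReadFootersExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path("/data/events");  // hypothetical directory of part files
    FileSystem fs = dir.getFileSystem(conf);

    // Collect the data files, skipping _metadata / _common_metadata / _SUCCESS.
    List<FileStatus> parts = new ArrayList<FileStatus>();
    for (FileStatus status : fs.listStatus(dir)) {
      if (!status.getPath().getName().startsWith("_")) {
        parts.add(status);
      }
    }

    // Deprecated API: prefers the merged summary metadata when present and
    // falls back to reading per-file footers in parallel otherwise.
    List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, parts, true);
    for (Footer footer : footers) {
      System.out.println(footer.getFile());
    }
  }
}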