Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
Class CheckParquet251Command, method check:
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

      @Override
      public boolean apply(@Nullable ColumnDescriptor input) {
        return input != null && input.getType() == BINARY;
      }
    }));
    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }
  return null;
}
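The command above still uses the deprecated ParquetFileReader.readFooter helper and the Configuration-based constructor. For comparison, a minimal sketch of the same footer read through the newer InputFile entry point; the class and method names below are illustrative and not part of parquet-mr:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

class FooterSketch {
  // Reads the footer via ParquetFileReader.open and returns the created-by string;
  // try-with-resources closes the reader even if reading the footer fails.
  static String createdBy(Configuration conf, Path path) throws IOException {
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      ParquetMetadata footer = reader.getFooter();
      return footer.getFileMetaData().getCreatedBy();
    }
  }
}

Row groups can then be iterated with reader.readNextRowGroup(), exactly as in the deprecated-constructor code above.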
Use of org.apache.parquet.hadoop.ParquetFileReader in project hive by apache.
Class VectorizedParquetRecordReader, method initialize:
@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException, HiveException {
  // the oldSplit may be null during the split phase
  if (oldSplit == null) {
    return;
  }
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
  if (mapWork != null) {
    parts = mapWork.getPathToPartitionInfo();
  }
  ParquetInputSplit split = (ParquetInputSplit) oldSplit;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  Object cacheKey = null;
  CacheTag cacheTag = null;
  // TODO: also support fileKey in splits, like OrcSplit does
  if (metadataCache != null) {
    if (cacheKey == null) {
      cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
          HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
          HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID),
          !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
    }
  }
  if (cacheKey != null) {
    if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
      PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
      cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
    }
    // If we are going to use cache, change the path to depend on file ID for extra consistency.
    FileSystem fs = file.getFileSystem(configuration);
    if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
    }
  }
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  this.writerTimezone = DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
  colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
  requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
  Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
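In the rowGroupOffsets == null branch the reader applies whatever row-group filter has been registered on the configuration (the getFilter call presumably delegates to ParquetInputFormat.getFilter). A minimal sketch of how such a predicate is typically registered on the job side; the column name "id" and the helper class below are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

class FilterSetupSketch {
  // Registers a predicate on the job configuration; readers that later call
  // ParquetInputFormat.getFilter(conf) can skip row groups whose statistics
  // show the predicate cannot match.
  static void pushDownIdEquals(Configuration conf, int id) {
    FilterPredicate predicate = FilterApi.eq(FilterApi.intColumn("id"), id);
    ParquetInputFormat.setFilterPredicate(conf, predicate);
  }
}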
Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
Class DumpCommand, method dump:
public static void dump(PrettyPrintWriter out, ParquetMetadata meta, MessageType schema, Path inpath,
    boolean showmd, boolean showdt, Set<String> showColumns) throws IOException {
  Configuration conf = new Configuration();
  List<BlockMetaData> blocks = meta.getBlocks();
  List<ColumnDescriptor> columns = schema.getColumns();
  if (showColumns != null) {
    columns = new ArrayList<ColumnDescriptor>();
    for (ColumnDescriptor column : schema.getColumns()) {
      String path = Joiner.on('.').skipNulls().join(column.getPath());
      if (showColumns.contains(path)) {
        columns.add(column);
      }
    }
  }
  ParquetFileReader freader = null;
  if (showmd) {
    try {
      long group = 0;
      for (BlockMetaData block : blocks) {
        if (group != 0)
          out.println();
        out.format("row group %d%n", group++);
        out.rule('-');
        List<ColumnChunkMetaData> ccmds = block.getColumns();
        if (showColumns != null) {
          ccmds = new ArrayList<ColumnChunkMetaData>();
          for (ColumnChunkMetaData ccmd : block.getColumns()) {
            String path = Joiner.on('.').skipNulls().join(ccmd.getPath().toArray());
            if (showColumns.contains(path)) {
              ccmds.add(ccmd);
            }
          }
        }
        MetadataUtils.showDetails(out, ccmds);
        List<BlockMetaData> rblocks = Collections.singletonList(block);
        freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, rblocks, columns);
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          out.incrementTabLevel();
          for (ColumnDescriptor column : columns) {
            out.println();
            dump(out, store, column);
          }
          out.decrementTabLevel();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      }
    } finally {
      if (freader != null) {
        freader.close();
      }
    }
  }
  if (showdt) {
    boolean first = true;
    for (ColumnDescriptor column : columns) {
      if (!first || showmd)
        out.println();
      first = false;
      out.format("%s %s%n", column.getType(), Joiner.on('.').skipNulls().join(column.getPath()));
      out.rule('-');
      try {
        long page = 1;
        long total = blocks.size();
        long offset = 1;
        freader = new ParquetFileReader(conf, meta.getFileMetaData(), inpath, blocks, Collections.singletonList(column));
        PageReadStore store = freader.readNextRowGroup();
        while (store != null) {
          ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DumpGroupConverter(), schema, meta.getFileMetaData().getCreatedBy());
          dump(out, crstore, column, page++, total, offset);
          offset += store.getRowCount();
          store = freader.readNextRowGroup();
        }
        out.flushColumns();
      } finally {
        out.flushColumns();
        if (freader != null) {
          freader.close();
        }
      }
    }
  }
}
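The overloaded dump helpers called above are not part of this excerpt. As a rough illustration of the same ColumnReadStoreImpl API they rely on, the sketch below walks one column of a row group and counts its non-null values; the class and method names are hypothetical:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;

class ColumnWalkSketch {
  // Walks every value slot of one column in one row group. A value is only
  // present when its definition level reaches the column's maximum; lower
  // levels mark a null somewhere along the column path.
  static long countNonNullValues(ColumnReadStoreImpl crstore, ColumnDescriptor column) {
    ColumnReader creader = crstore.getColumnReader(column);
    int maxDefinitionLevel = column.getMaxDefinitionLevel();
    long nonNull = 0;
    for (long i = 0, n = creader.getTotalValueCount(); i < n; i++) {
      if (creader.getCurrentDefinitionLevel() == maxDefinitionLevel) {
        nonNull++;
      }
      creader.consume();
    }
    return nonNull;
  }
}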
Use of org.apache.parquet.hadoop.ParquetFileReader in project hive by apache.
Class ParquetRecordWriterWrapper, method close:
@Override
public void close(final Reporter reporter) throws IOException {
  try {
    realWriter.close(taskContext);
  } catch (final InterruptedException e) {
    throw new IOException(e);
  }
  // Collect file stats; try-with-resources makes sure the reader is closed.
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(this.file, this.jobConf))) {
    long totalSize = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      totalSize += block.getTotalByteSize();
    }
    stats = new SerDeStats();
    stats.setRowCount(reader.getRecordCount());
    stats.setRawDataSize(totalSize);
  } catch (IOException e) {
    // Ignore: stats collection is best-effort.
  }
}
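The stats field filled in here is what Hive later asks the writer for when gathering table statistics. A minimal sketch of the accessor, assuming (not shown in this excerpt) that the wrapper implements Hive's StatsProvidingRecordWriter interface:

// Assumption: the surrounding class implements
// org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter, whose extra method
// hands the collected statistics back to the file sink operator.
@Override
public SerDeStats getStats() {
  return stats;
}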
Use of org.apache.parquet.hadoop.ParquetFileReader in project ignite by apache.
Class SparkModelParser, method loadKMeansModel:
/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector[] centers = null;
  try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    while (null != (pages = r.readNextRowGroup())) {
      final int rows = (int) pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      centers = new DenseVector[rows];
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup) recordReader.read();
        // final int clusterIdx = g.getInteger(0, 0);
        Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
        final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);
        centers[i] = new DenseVector(amountOfCoefficients);
        for (int j = 0; j < amountOfCoefficients; j++) {
          double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
          centers[i].set(j, coefficient);
        }
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }
  return new KMeansModel(centers, new EuclideanDistance());
}
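The hard-coded indices in g.getGroup(1, 0).getGroup(3, 0) depend on the schema Spark wrote for the model, so when adapting this parser to another Spark version it helps to print that schema first. A minimal, illustrative sketch (the class and method names below are not part of Ignite):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

class SchemaPeekSketch {
  // Prints the Parquet message type so the field/group indices used by
  // loadKMeansModel can be checked against the actual file layout.
  static void printSchema(String pathToMdl) throws IOException {
    try (ParquetFileReader r = ParquetFileReader.open(
             HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
      MessageType schema = r.getFooter().getFileMetaData().getSchema();
      System.out.println(schema);
    }
  }
}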