Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.
The class TupleConsumerPerfTest, method read.
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);
    MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    // init() produces the ReadContext that prepareForRead() needs to build the Tuple materializer
    ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
    RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
    RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
    // TODO: put this back
    // if (DEBUG) {
    //   recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
    // }
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 100000, pigSchemaString);
    read(recordReader, 1000000, pigSchemaString);
    System.out.println();
}
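The three-argument read overload driving the benchmark is not shown in this excerpt. A minimal sketch, assuming it simply materializes the requested number of tuples through the RecordReader and prints the elapsed time (the exact timing and output in TupleConsumerPerfTest may differ):

private static void read(RecordReader<Tuple> recordReader, int count, String pigSchemaString) {
    long start = System.currentTimeMillis();
    for (int i = 0; i < count; i++) {
        recordReader.read();   // each call materializes one Tuple via TupleReadSupport's RecordMaterializer
    }
    long elapsed = System.currentTimeMillis() - start;
    System.out.println("read " + count + " records of schema " + pigSchemaString + " in " + elapsed + " ms");
}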
Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.
The class ParquetRecordReaderWrapper, method getSplit.
/**
 * Gets a ParquetInputSplit corresponding to a split given by Hive.
 *
 * @param oldSplit the split given by Hive
 * @param conf the JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) oldSplit;
        final long splitStart = fileSplit.getStart();
        final long splitLength = fileSplit.getLength();
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
        // read only the file metadata (SKIP_ROW_GROUPS) to build the ReadContext
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadContext readContext =
            new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        // the Hive schema is carried back through the ReadContext's readSupportMetadata
        schemaSize = MessageTypeParser.parseMessageType(
            readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
        return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
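getSplit above relies on the read support having placed a schema string into the ReadContext's readSupportMetadata (under DataWritableReadSupport.HIVE_SCHEMA_KEY). A minimal sketch of that general pattern, using a hypothetical read support class and metadata key rather than the actual DataWritableReadSupport implementation; imports from org.apache.parquet.hadoop.api, org.apache.parquet.io.api, org.apache.parquet.schema, and java.util are assumed:

// Hypothetical illustration of stashing metadata in a ReadContext; not part of parquet-mr or Hive.
public class SchemaStashingReadSupport extends ReadSupport<Void> {

    public static final String SCHEMA_KEY = "example.schema"; // hypothetical metadata key

    @Override
    public ReadContext init(InitContext context) {
        MessageType fileSchema = context.getFileSchema();
        Map<String, String> readSupportMetadata = new HashMap<String, String>();
        // store the schema as a string; later stages can recover it from
        // readContext.getReadSupportMetadata() and parse it with MessageTypeParser.parseMessageType(...)
        readSupportMetadata.put(SCHEMA_KEY, fileSchema.toString());
        return new ReadContext(fileSchema, readSupportMetadata);
    }

    @Override
    public RecordMaterializer<Void> prepareForRead(Configuration configuration,
            Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
        throw new UnsupportedOperationException("illustration only: schema propagation, no record materialization");
    }
}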
Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.
The class TestTupleRecordConsumer, method newPigRecordConsumer.
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString) throws ParserException {
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    final Configuration configuration = new Configuration(false);
    MessageType parquetSchema = getMessageType(pigSchemaString);
    final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
    for (Entry<String, String> entry : pigMetaData.entrySet()) {
        globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
    }
    configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
    // the InitContext overload of init() also yields a ReadContext
    final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
    return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.
The class ClientSideMetadataSplitStrategy, method getSplits.
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
    boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
    if (maxSplitSize < 0 || minSplitSize < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }
    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
    // initialize the configured ReadSupport once over the merged (global) metadata of all footers
    ReadContext readContext = getReadSupport(configuration).init(
        new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
    return new ClientSideMetadataSplitStrategy().getSplits(configuration, footers, maxSplitSize, minSplitSize, readContext);
}
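All four snippets consume a ReadContext produced by some ReadSupport implementation. A minimal, self-contained sketch of such an implementation, closely modeled on parquet-mr's own GroupReadSupport (the class name ExampleGroupReadSupport is hypothetical): init() projects the file schema using the optional parquet.read.schema property, and prepareForRead() returns a materializer for Group records built against that requested schema.

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

public class ExampleGroupReadSupport extends ReadSupport<Group> {

    @Override
    public ReadContext init(InitContext context) {
        // an optional projection can be passed as a schema string under parquet.read.schema;
        // getSchemaForRead falls back to the full file schema when no projection is set
        String partialSchema = context.getConfiguration().get(ReadSupport.PARQUET_READ_SCHEMA);
        MessageType requestedProjection = getSchemaForRead(context.getFileSchema(), partialSchema);
        return new ReadContext(requestedProjection);
    }

    @Override
    public RecordMaterializer<Group> prepareForRead(Configuration configuration,
            Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
        // build records against the schema requested in init(), not the full file schema
        return new GroupRecordConverter(readContext.getRequestedSchema());
    }
}

Such a class would typically be registered on the job, e.g. via ParquetInputFormat.setReadSupportClass(job, ExampleGroupReadSupport.class), so that split planning (getSplits above) and the record readers obtain their ReadContext from it.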