Search in sources:

Example 1 with ReadContext

Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.

From the class TupleConsumerPerfTest, method read:

private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);
    MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
    RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
    RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
    // TODO: put this back
    // if (DEBUG) {
    // recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
    // }
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 100000, pigSchemaString);
    read(recordReader, 1000000, pigSchemaString);
    System.out.println();
}
Also used: ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), MessageColumnIO (org.apache.parquet.io.MessageColumnIO), MessageType (org.apache.parquet.schema.MessageType), Tuple (org.apache.pig.data.Tuple)
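
The snippet above calls the deprecated ReadSupport.init(Configuration, Map, MessageType) overload directly. For context, here is a minimal sketch of the same init -> prepareForRead -> getRecordReader handshake, using the bundled example GroupReadSupport instead of the Pig-specific TupleReadSupport; the PageReadStore, key/value metadata, and file schema are assumed to come from an already-opened ParquetFileReader (setup not shown).

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

public class ReadContextHandshakeSketch {

    // reads `rowCount` records from one row group via the ReadSupport/ReadContext handshake
    static void readRowGroup(Configuration conf, Map<String, String> keyValueMetaData,
                             MessageType fileSchema, PageReadStore pages, long rowCount) {
        GroupReadSupport readSupport = new GroupReadSupport();
        // init() picks the requested (projected) schema and records it in the ReadContext
        ReadContext readContext = readSupport.init(conf, keyValueMetaData, fileSchema);
        // prepareForRead() turns that ReadContext into a materializer for the target record type
        RecordMaterializer<Group> materializer =
            readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext);
        MessageColumnIO columnIO =
            new ColumnIOFactory().getColumnIO(readContext.getRequestedSchema(), fileSchema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(pages, materializer);
        for (long i = 0; i < rowCount; i++) {
            Group group = recordReader.read();
            System.out.println(group);
        }
    }
}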

Example 2 with ReadContext

Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.

From the class ParquetRecordReaderWrapper, method getSplit:

/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) oldSplit;
        final long splitStart = fileSplit.getStart();
        final long splitLength = fileSplit.getLength();
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
        return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), FileSplit (org.apache.hadoop.mapred.FileSplit), JobConf (org.apache.hadoop.mapred.JobConf), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
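
The essential step for this wrapper is footer -> init -> ReadContext; the rest is split bookkeeping. A minimal sketch of that step, assuming a hypothetical file path and using the example GroupReadSupport rather than Hive's DataWritableReadSupport:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

public class FooterInitSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.parquet"); // hypothetical input file
        // SKIP_ROW_GROUPS reads only file-level metadata, as in getSplit above
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, SKIP_ROW_GROUPS);
        FileMetaData fileMetaData = footer.getFileMetaData();
        ReadContext readContext = new GroupReadSupport()
            .init(conf, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        // the projected schema the record reader will use for this file
        System.out.println(readContext.getRequestedSchema());
    }
}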

Example 3 with ReadContext

Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.

From the class TestTupleRecordConsumer, method newPigRecordConsumer:

private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString) throws ParserException {
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    final Configuration configuration = new Configuration(false);
    MessageType parquetSchema = getMessageType(pigSchemaString);
    final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
    for (Entry<String, String> entry : pigMetaData.entrySet()) {
        globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
    }
    configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
    final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
    return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
Also used: InitContext (org.apache.parquet.hadoop.api.InitContext), HashSet (java.util.HashSet), Set (java.util.Set), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), MessageType (org.apache.parquet.schema.MessageType)
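
All of the examples here call init from the consumer side. For the producer side, here is a minimal sketch of a custom ReadSupport whose init(InitContext) builds the ReadContext those callers receive; the pass-through projection and the "example.marker" metadata key are hypothetical:

import java.util.Collections;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

public class MarkerReadSupport extends ReadSupport<Group> {

    @Override
    public ReadContext init(InitContext context) {
        MessageType fileSchema = context.getFileSchema();
        // a real implementation would typically project fileSchema here instead of passing it through
        Map<String, String> readSupportMetadata =
            Collections.singletonMap("example.marker", "true"); // hypothetical metadata key
        return new ReadContext(fileSchema, readSupportMetadata);
    }

    @Override
    public RecordMaterializer<Group> prepareForRead(Configuration configuration,
            Map<String, String> keyValueMetaData, MessageType fileSchema,
            ReadContext readContext) {
        // materialize records against the schema chosen in init()
        return new GroupRecordConverter(readContext.getRequestedSchema());
    }
}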

Example 4 with ReadContext

Use of org.apache.parquet.hadoop.api.ReadSupport.ReadContext in project parquet-mr by apache.

From the class ClientSideMetadataSplitStrategy, method getSplits:

/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
    boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
    final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
    if (maxSplitSize < 0 || minSplitSize < 0) {
        throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
    }
    GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
    ReadContext readContext = getReadSupport(configuration).init(new InitContext(configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
    return new ClientSideMetadataSplitStrategy().getSplits(configuration, footers, maxSplitSize, minSplitSize, readContext);
}
Also used: InitContext (org.apache.parquet.hadoop.api.InitContext), ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException), ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData)
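
In ordinary client code none of these calls are made by hand: ParquetReader drives the init/prepareForRead handshake and threads the resulting ReadContext through internally. A minimal usage sketch, assuming a hypothetical input path:

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetReaderSketch {
    public static void main(String[] args) throws IOException {
        Path file = new Path("/tmp/example.parquet"); // hypothetical input file
        // builder() wires up GroupReadSupport; init() and prepareForRead() are called internally
        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
            Group group;
            while ((group = reader.read()) != null) {
                System.out.println(group);
            }
        }
    }
}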

Aggregations

ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext): 4 usages
InitContext (org.apache.parquet.hadoop.api.InitContext): 2 usages
MessageType (org.apache.parquet.schema.MessageType): 2 usages
HashMap (java.util.HashMap): 1 usage
HashSet (java.util.HashSet): 1 usage
Set (java.util.Set): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
Path (org.apache.hadoop.fs.Path): 1 usage
FileSplit (org.apache.hadoop.mapred.FileSplit): 1 usage
JobConf (org.apache.hadoop.mapred.JobConf): 1 usage
ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 1 usage
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 1 usage
GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData): 1 usage
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 1 usage
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 1 usage
ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException): 1 usage
Tuple (org.apache.pig.data.Tuple): 1 usage