Search in sources:

Example 1 with ParquetReader

Use of org.apache.parquet.hadoop.ParquetReader in project nifi by apache.

From the class PutParquetTest, method verifyAvroParquetUsers.

private void verifyAvroParquetUsers(final Path avroParquetUsers, final int numExpectedUsers) throws IOException {
    final ParquetReader.Builder<GenericRecord> readerBuilder = AvroParquetReader.<GenericRecord>builder(avroParquetUsers).withConf(testConf);
    int currUser = 0;
    try (final ParquetReader<GenericRecord> reader = readerBuilder.build()) {
        GenericRecord nextRecord;
        while ((nextRecord = reader.read()) != null) {
            Assert.assertNotNull(nextRecord);
            Assert.assertEquals("name" + currUser, nextRecord.get("name").toString());
            Assert.assertEquals(currUser, nextRecord.get("favorite_number"));
            Assert.assertEquals("blue" + currUser, nextRecord.get("favorite_color").toString());
            currUser++;
        }
    }
    Assert.assertEquals(numExpectedUsers, currUser);
}
Also used: ParquetReader (org.apache.parquet.hadoop.ParquetReader), AvroParquetReader (org.apache.parquet.avro.AvroParquetReader), GenericRecord (org.apache.avro.generic.GenericRecord)
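
For comparison, recent parquet-mr versions deprecate the Path-based builder used above in favour of one that takes an InputFile. A minimal sketch of the same read loop in that style, assuming a local Avro-Parquet file (the users.parquet path is a placeholder, not part of the NiFi test):

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ReadAvroParquetSketch {

    public static void main(final String[] args) throws IOException {
        // Placeholder path - substitute a real Avro-Parquet file
        final Path path = new Path("users.parquet");
        final Configuration conf = new Configuration();
        // HadoopInputFile wraps the Path for the non-deprecated builder overload
        try (ParquetReader<GenericRecord> reader =
                AvroParquetReader.<GenericRecord>builder(HadoopInputFile.fromPath(path, conf)).build()) {
            GenericRecord record;
            // read() returns null once all records have been consumed
            while ((record = reader.read()) != null) {
                System.out.println(record);
            }
        }
    }
}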

Example 2 with ParquetReader

Use of org.apache.parquet.hadoop.ParquetReader in project parquet-mr by apache.

From the class TestThriftToParquetFileWriter, method createRecordReader.

private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
    Configuration configuration = new Configuration(true);
    GroupReadSupport readSupport = new GroupReadSupport();
    // Read the file footer to recover the schema, then initialise the read support with it
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    readSupport.init(configuration, null, schema);
    return new ParquetReader<Group>(parquetFilePath, readSupport);
}
Also used: GroupReadSupport (org.apache.parquet.hadoop.example.GroupReadSupport), Configuration (org.apache.hadoop.conf.Configuration), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ParquetReader (org.apache.parquet.hadoop.ParquetReader), MessageType (org.apache.parquet.schema.MessageType)
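
The single-argument constructor above is the older direct form. A sketch of the same reader assembled through ParquetReader.builder, which lets GroupReadSupport pick the schema up from the file footer so the explicit readFooter/init steps fall away (data.parquet is a placeholder path):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadGroupsSketch {

    public static void main(final String[] args) throws IOException {
        // Placeholder path - substitute a real Parquet file
        final Path path = new Path("data.parquet");
        // GroupReadSupport is initialised with the file schema from the footer during build()
        try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path)
                .withConf(new Configuration(true))
                .build()) {
            Group group;
            while ((group = reader.read()) != null) {
                System.out.println(group);
            }
        }
    }
}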

Example 3 with ParquetReader

Use of org.apache.parquet.hadoop.ParquetReader in project Gaffer by gchq.

From the class CalculatePartitioner, method call.

public GraphPartitioner call() throws IOException {
    final SchemaUtils schemaUtils = new SchemaUtils(schema);
    final GraphPartitioner graphPartitioner = new GraphPartitioner();
    for (final String group : schema.getGroups()) {
        LOGGER.info("Calculating GroupPartitioner for group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, false));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Skip the first file; split points come from the first element of each subsequent file
            LOGGER.debug("Reading first element of {}", sortedPaths[i]);
            final Element element;
            try (final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                    .isEntity(schema.getEntityGroups().contains(group))
                    .usingConverter(converter)
                    .build()) {
                // NB Should never be null as empty files are removed before this is called
                element = reader.read();
            }
            if (null == element) {
                throw new IOException("No first element in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            final Object[] parquetObjects = converter.corePropertiesToParquetObjects(element);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitioner(group, groupPartitioner);
        LOGGER.info("GroupPartitioner for group {} is {}", group, groupPartitioner);
    }
    for (final String group : schema.getEdgeGroups()) {
        LOGGER.info("Calculating GroupPartitioner for reversed edge group {}", group);
        final GafferGroupObjectConverter converter = schemaUtils.getConverter(group);
        final List<PartitionKey> partitionKeys = new ArrayList<>();
        final Path groupPath = new Path(path, ParquetStore.getGroupSubDir(group, true));
        final FileStatus[] files = fs.listStatus(groupPath, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(f -> f.getPath()).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[] {});
        LOGGER.debug("Found {} files in {}", files.length, groupPath);
        for (int i = 1; i < sortedPaths.length; i++) {
            // NB Skip the first file; split points come from the first element of each subsequent file
            LOGGER.debug("Reading first element of {}", sortedPaths[i]);
            final Edge edge;
            try (final ParquetReader<Element> reader = new ParquetElementReader.Builder<Element>(sortedPaths[i])
                    .isEntity(false)
                    .usingConverter(converter)
                    .build()) {
                // NB Should never be null as empty files are removed before this is called
                edge = (Edge) reader.read();
            }
            if (null == edge) {
                throw new IOException("No first edge in file " + sortedPaths[i] + " - empty files are supposed to be removed");
            }
            final Object[] parquetObjects = converter.corePropertiesToParquetObjectsForReversedEdge(edge);
            final PartitionKey key = new PartitionKey(parquetObjects);
            partitionKeys.add(key);
        }
        final GroupPartitioner groupPartitioner = new GroupPartitioner(group, partitionKeys);
        graphPartitioner.addGroupPartitionerForReversedEdges(group, groupPartitioner);
    }
    return graphPartitioner;
}
Also used: Arrays (java.util.Arrays), Logger (org.slf4j.Logger), ParquetElementReader (uk.gov.gchq.gaffer.parquetstore.io.reader.ParquetElementReader), SortedSet (java.util.SortedSet), FileSystem (org.apache.hadoop.fs.FileSystem), ParquetReader (org.apache.parquet.hadoop.ParquetReader), LoggerFactory (org.slf4j.LoggerFactory), SchemaUtils (uk.gov.gchq.gaffer.parquetstore.utils.SchemaUtils), GroupPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GroupPartitioner), IOException (java.io.IOException), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetStore (uk.gov.gchq.gaffer.parquetstore.ParquetStore), Element (uk.gov.gchq.gaffer.data.element.Element), TreeSet (java.util.TreeSet), ArrayList (java.util.ArrayList), List (java.util.List), PartitionKey (uk.gov.gchq.gaffer.parquetstore.partitioner.PartitionKey), Schema (uk.gov.gchq.gaffer.store.schema.Schema), GraphPartitioner (uk.gov.gchq.gaffer.parquetstore.partitioner.GraphPartitioner), Path (org.apache.hadoop.fs.Path), Edge (uk.gov.gchq.gaffer.data.element.Edge), GafferGroupObjectConverter (uk.gov.gchq.gaffer.parquetstore.utils.GafferGroupObjectConverter)
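
The core pattern in this example is reading only the first record of each file, in sorted path order and skipping the first file, to derive partition split points. A standalone sketch of that pattern using plain Avro records in place of Gaffer's ParquetElementReader (the class name FirstRecordPerFile and the directory layout are illustrative):

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FirstRecordPerFile {

    // Returns the first record of every .parquet file under dir except the first,
    // visiting files in sorted path order. With n files this yields the n - 1
    // lower bounds needed to describe n contiguous partitions.
    static List<GenericRecord> firstRecords(final FileSystem fs, final Path dir,
            final Configuration conf) throws IOException {
        final FileStatus[] files = fs.listStatus(dir, p -> p.getName().endsWith(".parquet"));
        final SortedSet<Path> sortedFiles = new TreeSet<>();
        Arrays.stream(files).map(FileStatus::getPath).forEach(sortedFiles::add);
        final Path[] sortedPaths = sortedFiles.toArray(new Path[0]);
        final List<GenericRecord> firsts = new ArrayList<>();
        for (int i = 1; i < sortedPaths.length; i++) {
            // try-with-resources closes each reader right after its single read
            try (ParquetReader<GenericRecord> reader = AvroParquetReader
                    .<GenericRecord>builder(HadoopInputFile.fromPath(sortedPaths[i], conf))
                    .build()) {
                final GenericRecord first = reader.read();
                if (first == null) {
                    throw new IOException("Empty file: " + sortedPaths[i]);
                }
                firsts.add(first);
            }
        }
        return firsts;
    }
}

Closing each reader promptly matters here: only one record is pulled from each file, so holding every stream open across a large partition directory would accumulate leaked resources, which is why the Gaffer code also closes its reader inside the loop.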

Aggregations

ParquetReader (org.apache.parquet.hadoop.ParquetReader) 3
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
Arrays (java.util.Arrays) 1
List (java.util.List) 1
SortedSet (java.util.SortedSet) 1
TreeSet (java.util.TreeSet) 1
GenericRecord (org.apache.avro.generic.GenericRecord) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1
Path (org.apache.hadoop.fs.Path) 1
AvroParquetReader (org.apache.parquet.avro.AvroParquetReader) 1
GroupReadSupport (org.apache.parquet.hadoop.example.GroupReadSupport) 1
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata) 1
MessageType (org.apache.parquet.schema.MessageType) 1
Logger (org.slf4j.Logger) 1
LoggerFactory (org.slf4j.LoggerFactory) 1
Edge (uk.gov.gchq.gaffer.data.element.Edge) 1
Element (uk.gov.gchq.gaffer.data.element.Element) 1