Search in sources :

Example 21 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project.

In class TestParquetFileWriter, method testWriteMetadataFileWithRelativeOutputPath.

/**
 * {@link ParquetFileWriter#mergeFooters(Path, List)} expects a fully-qualified
 * path for the root and crashes if a relative one is provided.
 */
@Test
public void testWriteMetadataFileWithRelativeOutputPath() throws IOException {
    Configuration conf = new Configuration();
    // Build a qualified variant of the relative root so the footer paths are
    // fully qualified while the root handed to writeMetadataFile stays relative.
    Path root = new Path("target/_test_relative");
    Path qualified = FileSystem.get(conf).makeQualified(root);
    // Minimal footer whose ParquetMetadata is a mock returning real FileMetaData.
    FileMetaData fileMetaData = new FileMetaData(
        new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a")),
        new HashMap<String, String>(),
        "test");
    ParquetMetadata parquetMetadata = Mockito.mock(ParquetMetadata.class);
    Mockito.when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);
    List<Footer> footers = new ArrayList<Footer>();
    footers.add(new Footer(new Path(qualified, "one"), parquetMetadata));
    // Passing the *relative* root must not throw.
    ParquetFileWriter.writeMetadataFile(conf, root, footers, JobSummaryLevel.ALL);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 22 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project.

In class TestParquetFileWriter, method testMergeMetadataWithConflictingKeyValues.

/**
 * Merging two footers whose key/value metadata disagree on a key must fail
 * under the strict strategy and concatenate the values under the lenient one.
 */
@Test
public void testMergeMetadataWithConflictingKeyValues() {
    // Plain maps instead of double-brace initialization: the anonymous
    // HashMap subclasses created by {{ put(...); }} add a class per map and
    // capture a reference to the enclosing test instance for no benefit.
    Map<String, String> keyValues1 = new HashMap<String, String>();
    keyValues1.put("a", "b");
    Map<String, String> keyValues2 = new HashMap<String, String>();
    keyValues2.put("a", "c");
    // Two files with identical schemas but a conflicting value for key "a".
    FileMetaData md1 = new FileMetaData(new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b")), keyValues1, "test");
    FileMetaData md2 = new FileMetaData(new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a"), new PrimitiveType(OPTIONAL, BINARY, "b")), keyValues2, "test");
    GlobalMetaData merged = ParquetFileWriter.mergeInto(md2, ParquetFileWriter.mergeInto(md1, null));
    try {
        merged.merge(new StrictKeyValueMetadataMergeStrategy());
        fail("Merge metadata is expected to fail because of conflicting key values");
    } catch (RuntimeException e) {
        // expected because of conflicting values
        assertTrue(e.getMessage().contains("could not merge metadata"));
    }
    // The concatenating strategy keeps both values; their order is unspecified.
    Map<String, String> mergedKeyValues = merged.merge(new ConcatenatingKeyValueMetadataMergeStrategy()).getKeyValueMetaData();
    assertEquals(1, mergedKeyValues.size());
    String mergedValue = mergedKeyValues.get("a");
    assertTrue(mergedValue.equals("b,c") || mergedValue.equals("c,b"));
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 23 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project.

In class SchemaControlEncryptionTest, method encryptParquetFile.

/**
 * Writes 1000 rows of {@code testData} to {@code file} using the example
 * group writer, with the schema registered on {@code conf}.
 *
 * @param file destination path for the Parquet file
 * @param conf configuration carrying the (encryption) write settings
 * @return the same {@code file} path, for chaining
 * @throws IOException if the write fails
 */
private String encryptParquetFile(String file, Configuration conf) throws IOException {
    // Name (required binary), Age (required int64), plus an optional WebLinks
    // group containing repeated LinkedIn/Twitter binary fields.
    MessageType schema = new MessageType(
        "schema",
        new PrimitiveType(REQUIRED, BINARY, "Name"),
        new PrimitiveType(REQUIRED, INT64, "Age"),
        new GroupType(OPTIONAL, "WebLinks",
            new PrimitiveType(REPEATED, BINARY, "LinkedIn"),
            new PrimitiveType(REPEATED, BINARY, "Twitter")));
    conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
    Builder writerBuilder = new Builder(new Path(file));
    writerBuilder.withConf(conf);
    try (ParquetWriter writer = writerBuilder.build()) {
        for (int row = 0; row < 1000; row++) {
            SimpleGroup record = new SimpleGroup(schema);
            record.add("Name", (String) testData.get("Name")[row]);
            record.add("Age", (Long) testData.get("Age")[row]);
            Group webLinks = record.addGroup("WebLinks");
            webLinks.add(0, (String) testData.get("LinkedIn")[row]);
            webLinks.add(1, (String) testData.get("Twitter")[row]);
            writer.write(record);
        }
    }
    return file;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) GroupType(org.apache.parquet.schema.GroupType) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)

Example 24 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project.

In class ShowDictionaryCommand, method run.

/**
 * Prints the dictionary page of the requested column for every row group of
 * the single target Parquet file (or a note when a row group has none).
 *
 * @return 0 on success
 * @throws IOException if the file cannot be read
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        ColumnDescriptor descriptor = Util.descriptor(column, schema);
        // Fail with a useful message instead of a bare NPE when the requested
        // column does not resolve to a primitive type in this file's schema.
        PrimitiveType type = Preconditions.checkNotNull(
            Util.primitive(column, schema), "Cannot resolve primitive type for column: %s", column);
        DictionaryPageReadStore dictionaryReader;
        int rowGroup = 0;
        // Walk row groups via their dictionary readers; skip the data pages.
        while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
            DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
            if (page != null) {
                console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
                Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
                printDictionary(dict, type);
            } else {
                console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
            }
            reader.skipNextRowGroup();
            rowGroup += 1;
        }
    }
    console.info("");
    return 0;
}
Also used : Dictionary(org.apache.parquet.column.Dictionary) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DictionaryPageReadStore(org.apache.parquet.column.page.DictionaryPageReadStore) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 25 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project.

In class ShowPagesCommand, method run.

/**
 * Prints a formatted summary line for every dictionary and data page of the
 * selected columns (all columns by default), grouped by column name.
 *
 * @return 0 on success
 * @throws IOException if the file cannot be read
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        // Resolve which columns to dump: every column when none were named,
        // otherwise only the ones given on the command line.
        Map<ColumnDescriptor, PrimitiveType> selected = Maps.newLinkedHashMap();
        if (this.columns == null || this.columns.isEmpty()) {
            for (ColumnDescriptor descriptor : schema.getColumns()) {
                selected.put(descriptor, primitive(schema, descriptor.getPath()));
            }
        } else {
            for (String column : this.columns) {
                selected.put(descriptor(column, schema), primitive(column, schema));
            }
        }
        // NOTE(review): the codec is taken from the first column chunk of the
        // first row group — presumably all chunks share it; confirm upstream.
        CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
        // Formatted page lines accumulated per column name, printed at the end.
        Map<String, List<String>> linesByColumn = Maps.newLinkedHashMap();
        PageFormatter formatter = new PageFormatter();
        PageReadStore rowGroup;
        int rowGroupIndex = 0;
        while ((rowGroup = reader.readNextRowGroup()) != null) {
            for (Map.Entry<ColumnDescriptor, PrimitiveType> entry : selected.entrySet()) {
                ColumnDescriptor descriptor = entry.getKey();
                String name = columnName(descriptor);
                List<String> lines = linesByColumn.get(name);
                if (lines == null) {
                    lines = Lists.newArrayList();
                    linesByColumn.put(name, lines);
                }
                formatter.setContext(rowGroupIndex, entry.getValue(), codec);
                PageReader pages = rowGroup.getPageReader(descriptor);
                // The dictionary page, if any, comes before the data pages.
                DictionaryPage dict = pages.readDictionaryPage();
                if (dict != null) {
                    lines.add(formatter.format(dict));
                }
                DataPage page;
                while ((page = pages.readPage()) != null) {
                    lines.add(formatter.format(page));
                }
            }
            rowGroupIndex += 1;
        }
        // TODO: Show total column size and overall size per value in the column summary line
        for (Map.Entry<String, List<String>> entry : linesByColumn.entrySet()) {
            console.info(String.format("\nColumn: %s\n%s", entry.getKey(), new TextStringBuilder(80).appendPadding(80, '-')));
            console.info(formatter.getHeader());
            for (String line : entry.getValue()) {
                console.info(line);
            }
            console.info("");
        }
    }
    return 0;
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingAsString(org.apache.parquet.cli.Util.encodingAsString) TextStringBuilder(org.apache.commons.text.TextStringBuilder) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) PageReadStore(org.apache.parquet.column.page.PageReadStore) PrimitiveType(org.apache.parquet.schema.PrimitiveType) List(java.util.List) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)123 Test (org.junit.Test)66 MessageType (org.apache.parquet.schema.MessageType)41 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 GroupType (org.apache.parquet.schema.GroupType)25 Test (org.testng.annotations.Test)20 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)14 OriginalType (org.apache.parquet.schema.OriginalType)14 Type (org.apache.parquet.schema.Type)12 List (java.util.List)11 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)11 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)11 ArrayList (java.util.ArrayList)10