Example usage of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project:
class TestParquetFileWriter, method testWriteMetadataFileWithRelativeOutputPath.
/**
 * Regression test: {@link ParquetFileWriter#mergeFooters(Path, List)} requires a
 * fully-qualified root path and used to crash when given a relative one.
 */
@Test
public void testWriteMetadataFileWithRelativeOutputPath() throws IOException {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path relativeRoot = new Path("target/_test_relative");
  Path qualifiedRoot = fs.makeQualified(relativeRoot);

  // Stub a footer: a single-column schema is enough to exercise the merge path.
  ParquetMetadata mock = Mockito.mock(ParquetMetadata.class);
  FileMetaData fileMetaData =
      new FileMetaData(
          new MessageType("root1", new PrimitiveType(REPEATED, BINARY, "a")),
          new HashMap<String, String>(),
          "test");
  Mockito.when(mock.getFileMetaData()).thenReturn(fileMetaData);

  List<Footer> footers = new ArrayList<Footer>();
  footers.add(new Footer(new Path(qualifiedRoot, "one"), mock));

  // Must complete without throwing even though the root path is relative.
  ParquetFileWriter.writeMetadataFile(conf, relativeRoot, footers, JobSummaryLevel.ALL);
}
Example usage of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project:
class TestParquetFileWriter, method testMergeMetadataWithConflictingKeyValues.
/**
 * Verifies merging of footers whose key/value metadata conflict on the same key:
 * the strict strategy must throw, while the concatenating strategy must keep
 * both values (in unspecified order) under the single key.
 */
@Test
public void testMergeMetadataWithConflictingKeyValues() {
  // Plain maps instead of double-brace initialization: the anonymous-subclass
  // idiom creates a hidden reference to the enclosing test instance and can
  // break Map equality against non-subclassed maps.
  Map<String, String> keyValues1 = new HashMap<String, String>();
  keyValues1.put("a", "b");
  Map<String, String> keyValues2 = new HashMap<String, String>();
  keyValues2.put("a", "c");
  FileMetaData md1 = new FileMetaData(
      new MessageType("root1",
          new PrimitiveType(REPEATED, BINARY, "a"),
          new PrimitiveType(OPTIONAL, BINARY, "b")),
      keyValues1, "test");
  FileMetaData md2 = new FileMetaData(
      new MessageType("root1",
          new PrimitiveType(REPEATED, BINARY, "a"),
          new PrimitiveType(OPTIONAL, BINARY, "b")),
      keyValues2, "test");
  GlobalMetaData merged = ParquetFileWriter.mergeInto(md2, ParquetFileWriter.mergeInto(md1, null));
  // Strict merging of two different values for key "a" must fail.
  try {
    merged.merge(new StrictKeyValueMetadataMergeStrategy());
    fail("Merge metadata is expected to fail because of conflicting key values");
  } catch (RuntimeException e) {
    // expected because of conflicting values
    assertTrue(e.getMessage().contains("could not merge metadata"));
  }
  // Concatenating strategy joins both values; order is unspecified, so accept
  // either permutation.
  Map<String, String> mergedKeyValues =
      merged.merge(new ConcatenatingKeyValueMetadataMergeStrategy()).getKeyValueMetaData();
  assertEquals(1, mergedKeyValues.size());
  String mergedValue = mergedKeyValues.get("a");
  assertTrue(mergedValue.equals("b,c") || mergedValue.equals("c,b"));
}
Example usage of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project:
class SchemaControlEncryptionTest, method encryptParquetFile.
/**
 * Writes 1000 rows of {@code testData} to {@code file} using the supplied
 * Hadoop configuration (which carries the encryption properties under test).
 *
 * @param file output path for the encrypted Parquet file
 * @param conf configuration with encryption settings; the example schema is set on it here
 * @return the path that was written (same as {@code file})
 * @throws IOException if writing the Parquet file fails
 */
private String encryptParquetFile(String file, Configuration conf) throws IOException {
  MessageType schema = new MessageType("schema",
      new PrimitiveType(REQUIRED, BINARY, "Name"),
      new PrimitiveType(REQUIRED, INT64, "Age"),
      new GroupType(OPTIONAL, "WebLinks",
          new PrimitiveType(REPEATED, BINARY, "LinkedIn"),
          new PrimitiveType(REPEATED, BINARY, "Twitter")));
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
  Path path = new Path(file);
  Builder builder = new Builder(path);
  builder.withConf(conf);
  // Parameterized writer type instead of the raw ParquetWriter to avoid
  // unchecked calls; try-with-resources closes and flushes the file.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < 1000; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("Name", (String) testData.get("Name")[i]);
      g.add("Age", (Long) testData.get("Age")[i]);
      Group links = g.addGroup("WebLinks");
      links.add(0, (String) testData.get("LinkedIn")[i]);
      links.add(1, (String) testData.get("Twitter")[i]);
      writer.write(g);
    }
  }
  return file;
}
Example usage of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project:
class ShowDictionaryCommand, method run.
/**
 * Prints the dictionary page contents (if any) for the requested column in
 * every row group of the single Parquet file named in {@code targets}.
 *
 * @return 0 on success
 * @throws IOException if the file cannot be opened or read
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
// Exactly one input file is supported.
Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
String source = targets.get(0);
try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
MessageType schema = reader.getFileMetaData().getSchema();
// Resolve the requested column against the file schema: the descriptor is
// used for page lookup, the primitive type for formatting values.
ColumnDescriptor descriptor = Util.descriptor(column, schema);
PrimitiveType type = Util.primitive(column, schema);
Preconditions.checkNotNull(type);
DictionaryPageReadStore dictionaryReader;
int rowGroup = 0;
// getNextDictionaryReader() yields one store per row group; the matching
// skipNextRowGroup() below keeps the reader's row-group cursor in step.
while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
if (page != null) {
console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
// Decode the raw page into a Dictionary so individual values can be printed.
Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
printDictionary(dict, type);
} else {
// The column may be plain-encoded in this row group (no dictionary page).
console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
}
reader.skipNextRowGroup();
rowGroup += 1;
}
}
console.info("");
return 0;
}
Example usage of org.apache.parquet.schema.PrimitiveType in the Apache parquet-mr project:
class ShowPagesCommand, method run.
/**
 * Prints a per-page summary (dictionary and data pages) for the selected
 * columns of the single Parquet file named in {@code targets}, grouped by
 * column and ordered by row group.
 *
 * @return 0 on success
 * @throws IOException if the file cannot be opened or read
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
// Exactly one input file is supported.
Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
String source = targets.get(0);
try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
MessageType schema = reader.getFileMetaData().getSchema();
// NOTE(review): this local deliberately shadows the field this.columns
// (user-specified column names); the map holds descriptor -> primitive type
// for every column that will be reported. Consider renaming for clarity.
Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
if (this.columns == null || this.columns.isEmpty()) {
// No columns requested: report all columns in the schema.
for (ColumnDescriptor descriptor : schema.getColumns()) {
columns.put(descriptor, primitive(schema, descriptor.getPath()));
}
} else {
for (String column : this.columns) {
columns.put(descriptor(column, schema), primitive(column, schema));
}
}
// Codec taken from the first chunk of the first row group; assumes one codec
// for the whole file — TODO confirm for mixed-codec files.
CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
// accumulate formatted lines to print by column
Map<String, List<String>> formatted = Maps.newLinkedHashMap();
PageFormatter formatter = new PageFormatter();
PageReadStore pageStore;
int rowGroupNum = 0;
// Walk every row group; within each, collect formatted lines per column so
// the output can be printed column-by-column afterwards.
while ((pageStore = reader.readNextRowGroup()) != null) {
for (ColumnDescriptor descriptor : columns.keySet()) {
List<String> lines = formatted.get(columnName(descriptor));
if (lines == null) {
lines = Lists.newArrayList();
formatted.put(columnName(descriptor), lines);
}
// Formatter context carries the row group number, column type, and codec.
formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
PageReader pages = pageStore.getPageReader(descriptor);
// Dictionary page (if any) is listed before the data pages of the chunk.
DictionaryPage dict = pages.readDictionaryPage();
if (dict != null) {
lines.add(formatter.format(dict));
}
DataPage page;
while ((page = pages.readPage()) != null) {
lines.add(formatter.format(page));
}
}
rowGroupNum += 1;
}
// TODO: Show total column size and overall size per value in the column summary line
for (String columnName : formatted.keySet()) {
console.info(String.format("\nColumn: %s\n%s", columnName, new TextStringBuilder(80).appendPadding(80, '-')));
console.info(formatter.getHeader());
for (String line : formatted.get(columnName)) {
console.info(line);
}
console.info("");
}
}
return 0;
}
Aggregations