Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
Class SchemaControlEncryptionTest, method encryptParquetFile.
/**
 * Writes a 1000-row example Parquet file to {@code file} and returns that same path string.
 *
 * <p>The schema has a required binary {@code Name}, a required int64 {@code Age}, and an optional
 * {@code WebLinks} group holding the repeated binary fields {@code LinkedIn} and {@code Twitter}.
 * The schema is stored in {@code conf} under {@code GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA}
 * before the writer is built, so {@code conf} is mutated by this call.
 *
 * <p>NOTE(review): nothing in this method configures encryption directly; presumably it is driven
 * by settings already present in {@code conf} — confirm against the caller.
 *
 * @param file destination path of the Parquet file
 * @param conf configuration handed to the writer builder; receives the example schema
 * @return the {@code file} argument, unchanged
 * @throws IOException declared for the writer; the visible code does not throw it explicitly
 */
private String encryptParquetFile(String file, Configuration conf) throws IOException {
    PrimitiveType nameField = new PrimitiveType(REQUIRED, BINARY, "Name");
    PrimitiveType ageField = new PrimitiveType(REQUIRED, INT64, "Age");
    GroupType webLinksField = new GroupType(OPTIONAL, "WebLinks",
        new PrimitiveType(REPEATED, BINARY, "LinkedIn"),
        new PrimitiveType(REPEATED, BINARY, "Twitter"));
    MessageType schema = new MessageType("schema", nameField, ageField, webLinksField);

    // The schema must be in the configuration before the builder reads it.
    conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

    Builder writerBuilder = new Builder(new Path(file));
    writerBuilder.withConf(conf);

    final int rowCount = 1000;
    try (ParquetWriter writer = writerBuilder.build()) {
        for (int row = 0; row < rowCount; row++) {
            SimpleGroup record = new SimpleGroup(schema);
            record.add("Name", (String) testData.get("Name")[row]);
            record.add("Age", (Long) testData.get("Age")[row]);

            // Inside WebLinks, field index 0 is LinkedIn and index 1 is Twitter.
            Group webLinks = record.addGroup("WebLinks");
            webLinks.add(0, (String) testData.get("LinkedIn")[row]);
            webLinks.add(1, (String) testData.get("Twitter")[row]);

            writer.write(record);
        }
    }
    return file;
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
Class ShowDictionaryCommand, method run.
/**
 * Prints the dictionary page of the configured {@code column} for every row group of a single
 * Parquet file, or a "has no dictionary" line for row groups where none is found.
 *
 * <p>Exactly one target file is accepted; the argument checks reject a null or empty target list
 * and a list with more than one entry.
 *
 * @return always {@code 0}
 * @throws IOException declared for the file reader; the visible code does not throw it explicitly
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && !targets.isEmpty(), "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");

    final String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        final MessageType schema = reader.getFileMetaData().getSchema();
        final ColumnDescriptor descriptor = Util.descriptor(column, schema);
        final PrimitiveType type = Util.primitive(column, schema);
        Preconditions.checkNotNull(type);

        int rowGroup = 0;
        for (DictionaryPageReadStore store = reader.getNextDictionaryReader();
             store != null;
             store = reader.getNextDictionaryReader()) {
            DictionaryPage page = store.readDictionaryPage(descriptor);
            if (page == null) {
                console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
            } else {
                console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
                Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
                printDictionary(dict, type);
            }
            // Advance the reader before the next dictionary store is requested.
            reader.skipNextRowGroup();
            rowGroup++;
        }
    }
    console.info("");
    return 0;
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
Class ShowPagesCommand, method run.
/**
 * Prints a per-column listing of the dictionary and data pages of a single Parquet file.
 *
 * <p>When no columns were requested, every column of the file schema is shown; otherwise only the
 * requested ones are. Formatted page lines are accumulated per column across all row groups and
 * printed once reading is finished, each column preceded by a header.
 *
 * <p>A file with no row groups has no pages, so nothing is printed and the method returns
 * normally instead of failing while looking up the codec of a non-existent first row group.
 *
 * @return always {@code 0}
 * @throws IOException declared for the file reader; the visible code does not throw it explicitly
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();

        // Resolve the columns to show, keeping schema order or the order given by the user.
        Map<ColumnDescriptor, PrimitiveType> columnTypes = Maps.newLinkedHashMap();
        if (this.columns == null || this.columns.isEmpty()) {
            for (ColumnDescriptor descriptor : schema.getColumns()) {
                columnTypes.put(descriptor, primitive(schema, descriptor.getPath()));
            }
        } else {
            for (String column : this.columns) {
                columnTypes.put(descriptor(column, schema), primitive(column, schema));
            }
        }

        // Without row groups there are no pages to show; indexing row group 0 below would throw.
        if (reader.getRowGroups().isEmpty()) {
            return 0;
        }

        // The codec of the first column chunk of the first row group is used as the formatter
        // context for every page.
        CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();

        // accumulate formatted lines to print by column
        Map<String, List<String>> formatted = Maps.newLinkedHashMap();
        PageFormatter formatter = new PageFormatter();
        PageReadStore pageStore;
        int rowGroupNum = 0;
        while ((pageStore = reader.readNextRowGroup()) != null) {
            for (Map.Entry<ColumnDescriptor, PrimitiveType> entry : columnTypes.entrySet()) {
                ColumnDescriptor descriptor = entry.getKey();
                String name = columnName(descriptor);
                List<String> lines = formatted.get(name);
                if (lines == null) {
                    lines = Lists.newArrayList();
                    formatted.put(name, lines);
                }

                formatter.setContext(rowGroupNum, entry.getValue(), codec);
                PageReader pages = pageStore.getPageReader(descriptor);

                // The dictionary page, when present, is listed before the data pages.
                DictionaryPage dict = pages.readDictionaryPage();
                if (dict != null) {
                    lines.add(formatter.format(dict));
                }
                DataPage page;
                while ((page = pages.readPage()) != null) {
                    lines.add(formatter.format(page));
                }
            }
            rowGroupNum += 1;
        }

        // TODO: Show total column size and overall size per value in the column summary line
        for (Map.Entry<String, List<String>> column : formatted.entrySet()) {
            console.info(String.format("\nColumn: %s\n%s", column.getKey(), new TextStringBuilder(80).appendPadding(80, '-')));
            console.info(formatter.getHeader());
            for (String line : column.getValue()) {
                console.info(line);
            }
            console.info("");
        }
    }
    return 0;
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
Class TestAvroSchemaConverter, method testTimestampMillisType.
/**
 * Checks that an Avro {@code long} carrying the timestamp-millis logical type round-trips
 * against the expected Parquet schema text, and that converting a Parquet primitive of any
 * listed other physical type annotated with {@code TIMESTAMP_MILLIS} is expected to throw
 * {@link IllegalArgumentException}.
 */
@Test
public void testTimestampMillisType() throws Exception {
    Schema timestampSchema = LogicalTypes.timestampMillis().addToSchema(Schema.create(LONG));
    Schema.Field timestampField = new Schema.Field("timestamp", timestampSchema, null, null);
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(timestampField));

    String parquetSchema = "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MILLIS,true));\n" + "}\n";
    testRoundTripConversion(expected, parquetSchema);

    PrimitiveTypeName[] unsupported = { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY };
    for (PrimitiveTypeName primitive : unsupported) {
        // FIXED_LEN_BYTE_ARRAY goes through the constructor overload that takes a length (12 here).
        final PrimitiveType type = (primitive == FIXED_LEN_BYTE_ARRAY)
            ? new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MILLIS)
            : new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MILLIS);
        assertThrows("Should not allow TIMESTAMP_MILLIS with " + primitive, IllegalArgumentException.class, () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Use of org.apache.parquet.schema.PrimitiveType in project parquet-mr by Apache.
Class TestAvroSchemaConverter, method testTimestampMicrosType.
/**
 * Checks that an Avro {@code long} carrying the timestamp-micros logical type round-trips
 * against the expected Parquet schema text, and that converting a Parquet primitive of any
 * listed other physical type annotated with {@code TIMESTAMP_MICROS} is expected to throw
 * {@link IllegalArgumentException}.
 */
@Test
public void testTimestampMicrosType() throws Exception {
    Schema timestampSchema = LogicalTypes.timestampMicros().addToSchema(Schema.create(LONG));
    Schema.Field timestampField = new Schema.Field("timestamp", timestampSchema, null, null);
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(timestampField));

    String parquetSchema = "message myrecord {\n" + " required int64 timestamp (TIMESTAMP(MICROS,true));\n" + "}\n";
    testRoundTripConversion(expected, parquetSchema);

    PrimitiveTypeName[] unsupported = { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY };
    for (PrimitiveTypeName primitive : unsupported) {
        // FIXED_LEN_BYTE_ARRAY goes through the constructor overload that takes a length (12 here).
        final PrimitiveType type = (primitive == FIXED_LEN_BYTE_ARRAY)
            ? new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MICROS)
            : new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MICROS);
        assertThrows("Should not allow TIMESTAMP_MICROS with " + primitive, IllegalArgumentException.class, () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Aggregations