Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class CheckParquet251Command, method check:
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(
        getConf(), fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
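For reference, the BINARY-column selection above can also be written with Java 8 streams. This is a sketch only, not the project's code; it assumes an import of java.util.stream.Collectors and uses the fully qualified PrimitiveTypeName.BINARY in place of the static import used above.

// Sketch: same selection of binary columns using streams instead of the Guava Predicate.
List<ColumnDescriptor> binaryColumns = meta.getSchema().getColumns().stream()
    .filter(c -> c.getType() == PrimitiveTypeName.BINARY)
    .collect(Collectors.toList());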
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestMemColumn, method testMemColumn:
@Test
public void testMemColumn() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType(
      "message msg { required group foo { required int64 bar; } }");
  ColumnDescriptor path = schema.getColumnDescription(new String[] { "foo", "bar" });
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(42L, 0, 0);
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
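The test writes and expects levels (0, 0) because every ancestor of bar in this schema is required, so the resolved ColumnDescriptor has maximum repetition and definition levels of 0. A minimal standalone sketch of those properties (plain assertions, not part of the test):

MessageType schema = MessageTypeParser.parseMessageType(
    "message msg { required group foo { required int64 bar; } }");
ColumnDescriptor bar = schema.getColumnDescription(new String[] { "foo", "bar" });
// A column whose ancestors are all required has no optional/repeated levels.
assert bar.getMaxRepetitionLevel() == 0;
assert bar.getMaxDefinitionLevel() == 0;
assert bar.getType() == PrimitiveTypeName.INT64;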
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestMemColumn, method testMemColumnSeveralPages:
@Test
public void testMemColumnSeveralPages() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType(
      "message msg { required group foo { required int64 bar; } }");
  String[] col = new String[] { "foo", "bar" };
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  for (int i = 0; i < 2000; i++) {
    columnWriter.write(42L, 0, 0);
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
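Writing 2000 values through a small in-memory page store spills the column across more than one page; the ColumnReader in the loop above advances through them transparently. A hedged sketch of how the page count could be inspected directly, using memPageStore and path from the test and assuming PageReader.readPage() returns null once the pages are exhausted:

PageReader pageReader = memPageStore.getPageReader(path);
long totalValues = pageReader.getTotalValueCount();  // 2000 after the writes above
int pageCount = 0;
for (DataPage page = pageReader.readPage(); page != null; page = pageReader.readPage()) {
  pageCount++;  // each iteration consumes one data page from the in-memory store
}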
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestMemColumn, method testMemColumnSeveralPagesRepeated:
@Test
public void testMemColumnSeveralPagesRepeated() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType(
      "message msg { repeated group foo { repeated int64 bar; } }");
  String[] col = new String[] { "foo", "bar" };
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);

  int[] rs = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
  int[] ds = { 0, 1, 2, 0, 1, 2, 0, 1, 2 };
  for (int i = 0; i < 837; i++) {
    int r = rs[i % rs.length];
    int d = ds[i % ds.length];
    LOG.debug("write i: {}", i);
    if (d == 2) {
      columnWriter.write((long) i, r, d);
    } else {
      columnWriter.writeNull(r, d);
    }
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  int i = 0;
  for (int j = 0; j < columnReader.getTotalValueCount(); j++) {
    int r = rs[i % rs.length];
    int d = ds[i % ds.length];
    LOG.debug("read i: {}", i);
    assertEquals("r row " + i, r, columnReader.getCurrentRepetitionLevel());
    assertEquals("d row " + i, d, columnReader.getCurrentDefinitionLevel());
    if (d == 2) {
      assertEquals("data row " + i, (long) i, columnReader.getLong());
    }
    columnReader.consume();
    ++i;
  }
}
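The writer only emits a value when the definition level is 2 because, for this schema, 2 is the column's maximum definition level; lower levels encode an empty or absent list rather than an int64. A minimal sketch of where those maxima come from:

MessageType mt = MessageTypeParser.parseMessageType(
    "message msg { repeated group foo { repeated int64 bar; } }");
ColumnDescriptor bar = mt.getColumnDescription(new String[] { "foo", "bar" });
// Two repeated levels ("foo" and "bar" itself) yield max levels of 2:
// definition level 2 means an actual int64 value, 1 means "foo" exists but
// "bar" is empty, 0 means "foo" itself is empty or absent.
assert bar.getMaxRepetitionLevel() == 2;
assert bar.getMaxDefinitionLevel() == 2;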
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class ParquetFileWriter, method appendRowGroup:
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup,
                           boolean dropColumns) throws IOException {
  startBlock(rowGroup.getRowCount());

  Map<String, ColumnChunkMetaData> columnsToCopy = new HashMap<String, ColumnChunkMetaData>();
  for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
    columnsToCopy.put(chunk.getPath().toDotString(), chunk);
  }

  List<ColumnChunkMetaData> columnsInOrder = new ArrayList<ColumnChunkMetaData>();
  for (ColumnDescriptor descriptor : schema.getColumns()) {
    String path = ColumnPath.get(descriptor.getPath()).toDotString();
    ColumnChunkMetaData chunk = columnsToCopy.remove(path);
    if (chunk != null) {
      columnsInOrder.add(chunk);
    } else {
      throw new IllegalArgumentException(String.format(
          "Missing column '%s', cannot copy row group: %s", path, rowGroup));
    }
  }

  // complain if some columns would be dropped and that's not okay
  if (!dropColumns && !columnsToCopy.isEmpty()) {
    throw new IllegalArgumentException(String.format(
        "Columns cannot be copied (missing from target schema): %s",
        Strings.join(columnsToCopy.keySet(), ", ")));
  }

  // copy the data for all chunks
  long start = -1;
  long length = 0;
  long blockCompressedSize = 0;
  for (int i = 0; i < columnsInOrder.size(); i += 1) {
    ColumnChunkMetaData chunk = columnsInOrder.get(i);

    // get this chunk's start position in the new file
    long newChunkStart = out.getPos() + length;

    // add this chunk to be copied with any previous chunks
    if (start < 0) {
      // no previous chunk included, start at this chunk's starting pos
      start = chunk.getStartingPos();
    }
    length += chunk.getTotalSize();

    if ((i + 1) == columnsInOrder.size() ||
        columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
      // not contiguous. do the copy now.
      copy(from, out, start, length);
      // reset to start at the next column chunk
      start = -1;
      length = 0;
    }

    currentBlock.addColumn(ColumnChunkMetaData.get(
        chunk.getPath(),
        chunk.getPrimitiveType(),
        chunk.getCodec(),
        chunk.getEncodingStats(),
        chunk.getEncodings(),
        chunk.getStatistics(),
        newChunkStart,
        newChunkStart,
        chunk.getValueCount(),
        chunk.getTotalSize(),
        chunk.getTotalUncompressedSize()));

    blockCompressedSize += chunk.getTotalSize();
  }

  currentBlock.setTotalByteSize(blockCompressedSize);
  endBlock();
}
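A hedged sketch of a caller for this method: writer, conf and inputPath are illustrative names, and writer is assumed to be a ParquetFileWriter created with a schema compatible with the source file; each block from the source footer is appended without dropping columns.

// Hypothetical caller (writer, conf and inputPath are placeholders).
ParquetMetadata footer = ParquetFileReader.readFooter(
    conf, inputPath, ParquetMetadataConverter.NO_FILTER);
try (SeekableInputStream from = HadoopInputFile.fromPath(inputPath, conf).newStream()) {
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    writer.appendRowGroup(from, rowGroup, false);  // false: fail if a column would be dropped
  }
}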