use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.
the class VersionTest method testVersionParser.
/**
 * Exercises {@code VersionParser.parse} against well-formed created-by strings,
 * an unparseable string, and strings with a missing semver and/or build hash.
 */
@Test
public void testVersionParser() throws Exception {
  assertParsesTo("parquet-mr version 1.6.0 (build abcd)", "parquet-mr", "1.6.0", "abcd");
  assertParsesTo("parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)", "parquet-mr", "1.6.22rc99-SNAPSHOT", "abcd");

  try {
    VersionParser.parse("unparseable string");
    fail("this should throw");
  } catch (VersionParseException expected) {
    // expected: the input does not look like a created-by string
  }

  // missing semver
  assertParsesTo("parquet-mr version (build abcd)", "parquet-mr", null, "abcd");
  assertParsesTo("parquet-mr version (build abcd)", "parquet-mr", null, "abcd");

  // missing build hash
  assertParsesTo("parquet-mr version 1.6.0 (build )", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version 1.6.0 (build)", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version (build)", "parquet-mr", null, null);
  assertParsesTo("parquet-mr version (build )", "parquet-mr", null, null);

  // Missing entire build section
  assertParsesTo("parquet-mr version 1.6.0", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version 1.8.0rc4", "parquet-mr", "1.8.0rc4", null);
  assertParsesTo("parquet-mr version 1.8.0rc4-SNAPSHOT", "parquet-mr", "1.8.0rc4-SNAPSHOT", null);
  assertParsesTo("parquet-mr version", "parquet-mr", null, null);

  // Various spaces
  assertParsesTo("parquet-mr version 1.6.0", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version 1.8.0rc4", "parquet-mr", "1.8.0rc4", null);
  assertParsesTo("parquet-mr version 1.8.0rc4-SNAPSHOT ", "parquet-mr", "1.8.0rc4-SNAPSHOT", null);
  assertParsesTo("parquet-mr version", "parquet-mr", null, null);
  assertParsesTo("parquet-mr version 1.6.0 ( build )", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version 1.6.0 ( build)", "parquet-mr", "1.6.0", null);
  assertParsesTo("parquet-mr version ( build)", "parquet-mr", null, null);
  assertParsesTo("parquet-mr version (build )", "parquet-mr", null, null);
}

/**
 * Asserts that parsing {@code createdBy} yields a ParsedVersion equal to one
 * built from the given application, semver and build hash.
 */
private static void assertParsesTo(String createdBy, String application, String semver, String buildHash) throws Exception {
  // build the expected value first, then parse, as a direct assertEquals call would
  ParsedVersion expected = new ParsedVersion(application, semver, buildHash);
  assertEquals(expected, VersionParser.parse(createdBy));
}
use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.
the class CorruptStatistics method shouldIgnoreStatistics.
/**
 * Tells the caller whether statistics from a file written by {@code createdBy}
 * (the created_by field from parquet format) must be discarded because they
 * are potentially corrupt; see PARQUET-251.
 *
 * @param createdBy the created-by string from a file footer
 * @param columnType the type of the column that this is checking
 * @return true if the statistics may be invalid and should be ignored, false otherwise
 */
public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) {
  boolean binaryColumn = columnType == PrimitiveTypeName.BINARY
      || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
  if (!binaryColumn) {
    // only binary columns are affected by the bug
    return false;
  }

  if (Strings.isNullOrEmpty(createdBy)) {
    // an unpopulated created_by could itself come from parquet-mr builds
    // of the PARQUET-251 era, see PARQUET-297
    warnOnce("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297");
    return true;
  }

  try {
    ParsedVersion parsed = VersionParser.parse(createdBy);

    if (!"parquet-mr".equals(parsed.application)) {
      // other writers are assumed not to have this bug
      return false;
    }

    if (Strings.isNullOrEmpty(parsed.version)) {
      warnOnce("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy);
      return true;
    }

    SemanticVersion fileVersion = SemanticVersion.parse(parsed.version);

    if (fileVersion.compareTo(PARQUET_251_FIXED_VERSION) >= 0) {
      // this file was created after the fix
      return false;
    }

    if (fileVersion.compareTo(CDH_5_PARQUET_251_FIXED_START) >= 0
        && fileVersion.compareTo(CDH_5_PARQUET_251_FIXED_END) < 0) {
      // falls inside the [CDH_5_PARQUET_251_FIXED_START, CDH_5_PARQUET_251_FIXED_END)
      // range, which is treated the same as a fixed version
      return false;
    }

    warnOnce("Ignoring statistics because this file was created prior to " + PARQUET_251_FIXED_VERSION + ", see PARQUET-251");
    return true;
  } catch (RuntimeException parseFailure) {
    // created_by could not be interpreted: log it and distrust the stats,
    // but do not make this fatal.
    warnParseErrorOnce(createdBy, parseFailure);
    return true;
  } catch (SemanticVersionParseException parseFailure) {
    // created_by could not be interpreted: log it and distrust the stats,
    // but do not make this fatal.
    warnParseErrorOnce(createdBy, parseFailure);
    return true;
  } catch (VersionParseException parseFailure) {
    // created_by could not be interpreted: log it and distrust the stats,
    // but do not make this fatal.
    warnParseErrorOnce(createdBy, parseFailure);
    return true;
  }
}
use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.
/**
 * Writes two delta-byte-array pages, the second produced by a writer passed
 * through {@code corruptWriter}, then reads every value back via
 * ColumnReaderImpl (constructed with a parquet-mr 1.6.0 ParsedVersion) and
 * checks the values match what was written.
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
  final int valuesPerPage = 10;

  ColumnDescriptor descriptor = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
  MemPageStore pageStore = new MemPageStore(0);
  PageWriter pageWriter = pageStore.getPageWriter(descriptor);
  ParquetProperties props = ParquetProperties.builder().withDictionaryEncoding(false).build();

  // generic repetition and definition level bytes shared by both pages
  ValuesWriter levelWriter = props.newDefinitionLevelWriter(descriptor);
  for (int n = 0; n < valuesPerPage; n++) {
    levelWriter.writeInteger(0);
  }
  // backed by a byte array because the same BytesInput is reused
  BytesInput levels = BytesInput.from(levelWriter.getBytes().toByteArray());

  DeltaByteArrayWriter deltaWriter = getDeltaByteArrayWriter();
  List<String> expected = new ArrayList<String>();
  String previous = null;

  // first page: values 0..9
  for (int n = 0; n < valuesPerPage; n++) {
    previous = str(n);
    deltaWriter.writeBytes(Binary.fromString(previous));
    expected.add(previous);
  }
  pageWriter.writePage(
      BytesInput.concat(levels, levels, deltaWriter.getBytes()),
      valuesPerPage, /* number of values in the page */
      new BinaryStatistics(),
      levelWriter.getEncoding(),
      levelWriter.getEncoding(),
      deltaWriter.getEncoding());
  pageStore.addRowCount(valuesPerPage);

  // sets previous to new byte[0]
  deltaWriter.reset();
  corruptWriter(deltaWriter, previous);

  // second page: values 10..19
  for (int n = valuesPerPage; n < 2 * valuesPerPage; n++) {
    String current = str(n);
    deltaWriter.writeBytes(Binary.fromString(current));
    expected.add(current);
  }
  pageWriter.writePage(
      BytesInput.concat(levels, levels, deltaWriter.getBytes()),
      valuesPerPage, /* number of values in the page */
      new BinaryStatistics(),
      levelWriter.getEncoding(),
      levelWriter.getEncoding(),
      deltaWriter.getEncoding());
  pageStore.addRowCount(valuesPerPage);

  // collect every value the reader hands to the converter
  final List<String> actual = new ArrayList<String>();
  PrimitiveConverter collector = new PrimitiveConverter() {
    @Override
    public void addBinary(Binary value) {
      actual.add(value.toStringUsingUTF8());
    }
  };

  ParsedVersion writerVersion = new ParsedVersion("parquet-mr", "1.6.0", "abcd");
  ColumnReaderImpl reader = new ColumnReaderImpl(descriptor, pageStore.getPageReader(descriptor), collector, writerVersion);
  while (actual.size() < reader.getTotalValueCount()) {
    reader.writeCurrentValueToConverter();
    reader.consume();
  }

  Assert.assertEquals(expected, actual);
}
use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.
the class TestCorruptDeltaByteArrays method testEncodingRequiresSequentailRead.
/**
 * Checks {@code CorruptDeltaByteArrays.requiresSequentialReads} for
 * DELTA_BYTE_ARRAY against three writers: impala 1.2.0 (false),
 * parquet-mr 1.8.0-SNAPSHOT (true) and parquet-mr 1.8.0 (false).
 */
@Test
public void testEncodingRequiresSequentailRead() {
  // a writer other than parquet-mr
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      new ParsedVersion("impala", "1.2.0", "abcd"), Encoding.DELTA_BYTE_ARRAY));

  // parquet-mr snapshot of 1.8.0
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(
      new ParsedVersion("parquet-mr", "1.8.0-SNAPSHOT", "abcd"), Encoding.DELTA_BYTE_ARRAY));

  // parquet-mr 1.8.0 release
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      new ParsedVersion("parquet-mr", "1.8.0", "abcd"), Encoding.DELTA_BYTE_ARRAY));
}
use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.
the class VersionTest method testFullVersion.
/**
 * Parses {@code Version.FULL_VERSION} and checks that its version component
 * passes {@code assertVersionValid}, equals {@code Version.VERSION_NUMBER},
 * and that the application is "parquet-mr".
 */
@Test
public void testFullVersion() throws Exception {
  ParsedVersion parsed = VersionParser.parse(Version.FULL_VERSION);
  String semver = parsed.version;

  assertVersionValid(semver);
  assertEquals(Version.VERSION_NUMBER, semver);
  assertEquals("parquet-mr", parsed.application);
}
Aggregations