Search in sources :

Example 1 with ParsedVersion

use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.

the class VersionTest method testVersionParser.

/**
 * Exercises VersionParser.parse against created_by strings with and without
 * a version, with and without a build hash, and with irregular spacing.
 */
@Test
public void testVersionParser() throws Exception {
    // fully populated strings: application, version and build hash are all present
    assertEquals(new ParsedVersion("parquet-mr", "1.6.0", "abcd"), VersionParser.parse("parquet-mr version 1.6.0 (build abcd)"));
    assertEquals(new ParsedVersion("parquet-mr", "1.6.22rc99-SNAPSHOT", "abcd"), VersionParser.parse("parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)"));

    // input that does not look like a created_by string must be rejected
    try {
        VersionParser.parse("unparseable string");
        fail("this should throw");
    } catch (VersionParseException expected) {
        // expected: the parser refused the input
    }

    // each row: { created_by string, expected version (or null), expected build hash (or null) }
    String[][] cases = {
        // missing semver
        { "parquet-mr version (build abcd)", null, "abcd" },
        { "parquet-mr version  (build abcd)", null, "abcd" },
        // missing build hash
        { "parquet-mr version 1.6.0 (build )", "1.6.0", null },
        { "parquet-mr version 1.6.0 (build)", "1.6.0", null },
        { "parquet-mr version (build)", null, null },
        { "parquet-mr version (build )", null, null },
        // Missing entire build section
        { "parquet-mr version 1.6.0", "1.6.0", null },
        { "parquet-mr version 1.8.0rc4", "1.8.0rc4", null },
        { "parquet-mr version 1.8.0rc4-SNAPSHOT", "1.8.0rc4-SNAPSHOT", null },
        { "parquet-mr version", null, null },
        // Various spaces
        { "parquet-mr     version    1.6.0", "1.6.0", null },
        { "parquet-mr     version    1.8.0rc4", "1.8.0rc4", null },
        { "parquet-mr      version    1.8.0rc4-SNAPSHOT  ", "1.8.0rc4-SNAPSHOT", null },
        { "parquet-mr      version", null, null },
        { "parquet-mr version 1.6.0 (  build )", "1.6.0", null },
        { "parquet-mr     version 1.6.0 (    build)", "1.6.0", null },
        { "parquet-mr     version (    build)", null, null },
        { "parquet-mr    version    (build    )", null, null }
    };
    for (String[] row : cases) {
        assertEquals(new ParsedVersion("parquet-mr", row[1], row[2]), VersionParser.parse(row[0]));
    }
}
Also used : VersionParseException(org.apache.parquet.VersionParser.VersionParseException) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) Test(org.junit.Test)

Example 2 with ParsedVersion

use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.

the class CorruptStatistics method shouldIgnoreStatistics.

/**
 * Tells the caller whether column statistics written by the given producer
 * must be discarded because they may be corrupt (see PARQUET-251).
 *
 * Only BINARY and FIXED_LEN_BYTE_ARRAY columns are ever reported as suspect.
 * For those, statistics are ignored when the created_by string is null or
 * empty, cannot be parsed, carries no version, or names a parquet-mr version
 * older than PARQUET_251_FIXED_VERSION that also falls outside the range
 * [CDH_5_PARQUET_251_FIXED_START, CDH_5_PARQUET_251_FIXED_END). Files written
 * by applications other than parquet-mr are trusted.
 *
 * @param createdBy the created-by string from a file footer
 * @param columnType the type of the column that this is checking
 * @return true if the statistics may be invalid and should be ignored, false otherwise
 */
public static boolean shouldIgnoreStatistics(String createdBy, PrimitiveTypeName columnType) {
    boolean binaryLike = columnType == PrimitiveTypeName.BINARY || columnType == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
    if (!binaryLike) {
        // the bug only applies to binary columns
        return false;
    }

    if (Strings.isNullOrEmpty(createdBy)) {
        // an unpopulated created_by could itself have been produced by
        // parquet-mr in the PARQUET-251 time frame, see PARQUET-297
        warnOnce("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297");
        return true;
    }

    try {
        ParsedVersion parsed = VersionParser.parse(createdBy);

        if (!"parquet-mr".equals(parsed.application)) {
            // assume other applications don't have this bug
            return false;
        }

        if (Strings.isNullOrEmpty(parsed.version)) {
            warnOnce("Ignoring statistics because created_by did not contain a semver (see PARQUET-251): " + createdBy);
            return true;
        }

        SemanticVersion writerVersion = SemanticVersion.parse(parsed.version);

        if (writerVersion.compareTo(PARQUET_251_FIXED_VERSION) >= 0) {
            // this file was created after the fix
            return false;
        }

        if (writerVersion.compareTo(CDH_5_PARQUET_251_FIXED_START) >= 0 && writerVersion.compareTo(CDH_5_PARQUET_251_FIXED_END) < 0) {
            // older than the upstream fix, but inside the CDH 5 range that is treated as fixed
            return false;
        }

        warnOnce("Ignoring statistics because this file was created prior to " + PARQUET_251_FIXED_VERSION + ", see PARQUET-251");
        return true;
    } catch (RuntimeException e) {
        // created_by could not be interpreted: log it, distrust the stats,
        // but do not fail the read.
        warnParseErrorOnce(createdBy, e);
        return true;
    } catch (SemanticVersionParseException e) {
        // created_by could not be interpreted: log it, distrust the stats,
        // but do not fail the read.
        warnParseErrorOnce(createdBy, e);
        return true;
    } catch (VersionParseException e) {
        // created_by could not be interpreted: log it, distrust the stats,
        // but do not fail the read.
        warnParseErrorOnce(createdBy, e);
        return true;
    }
}
Also used : SemanticVersionParseException(org.apache.parquet.SemanticVersion.SemanticVersionParseException) VersionParseException(org.apache.parquet.VersionParser.VersionParseException) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion)

Example 3 with ParsedVersion

use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.

the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.

/**
 * Writes two ten-value pages with a DeltaByteArrayWriter, corrupting the
 * writer between them, then checks that a ColumnReaderImpl constructed with
 * a parquet-mr 1.6.0 ParsedVersion reads back all twenty strings unchanged.
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    String[] path = { "s" };
    ColumnDescriptor column = new ColumnDescriptor(path, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pageStore = new MemPageStore(0);
    PageWriter pageWriter = pageStore.getPageWriter(column);
    ParquetProperties props = ParquetProperties.builder().withDictionaryEncoding(false).build();

    // get generic repetition and definition level bytes to use for pages
    ValuesWriter levelWriter = props.newDefinitionLevelWriter(column);
    for (int n = 0; n < 10; n++) {
        levelWriter.writeInteger(0);
    }
    // use a byte array backed BytesInput because it is reused
    byte[] levelBytes = levelWriter.getBytes().toByteArray();
    BytesInput levels = BytesInput.from(levelBytes);

    DeltaByteArrayWriter deltaWriter = getDeltaByteArrayWriter();
    String previous = null;
    List<String> expected = new ArrayList<String>();

    // first page: values 0..9
    for (int n = 0; n < 10; n++) {
        previous = str(n);
        deltaWriter.writeBytes(Binary.fromString(previous));
        expected.add(previous);
    }
    BytesInput firstPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
    // the second argument is the number of values in the page
    pageWriter.writePage(firstPage, 10, new BinaryStatistics(), levelWriter.getEncoding(), levelWriter.getEncoding(), deltaWriter.getEncoding());
    pageStore.addRowCount(10);

    // sets previous to new byte[0]
    deltaWriter.reset();
    corruptWriter(deltaWriter, previous);

    // second page: values 10..19, written after the writer was corrupted
    for (int n = 10; n < 20; n++) {
        String current = str(n);
        deltaWriter.writeBytes(Binary.fromString(current));
        expected.add(current);
    }
    BytesInput secondPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
    // the second argument is the number of values in the page
    pageWriter.writePage(secondPage, 10, new BinaryStatistics(), levelWriter.getEncoding(), levelWriter.getEncoding(), deltaWriter.getEncoding());
    pageStore.addRowCount(10);

    // collect every binary value handed to the converter as a UTF-8 string
    final List<String> observed = new ArrayList<String>();
    PrimitiveConverter collector = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            observed.add(value.toStringUsingUTF8());
        }
    };

    ParsedVersion writerVersion = new ParsedVersion("parquet-mr", "1.6.0", "abcd");
    ColumnReaderImpl reader = new ColumnReaderImpl(column, pageStore.getPageReader(column), collector, writerVersion);
    while (observed.size() < reader.getTotalValueCount()) {
        reader.writeCurrentValueToConverter();
        reader.consume();
    }

    Assert.assertEquals(expected, observed);
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) DeltaByteArrayWriter(org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ParquetProperties(org.apache.parquet.column.ParquetProperties) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) Binary(org.apache.parquet.io.api.Binary) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 4 with ParsedVersion

use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.

the class TestCorruptDeltaByteArrays method testEncodingRequiresSequentailRead.

/**
 * Checks CorruptDeltaByteArrays.requiresSequentialReads for DELTA_BYTE_ARRAY
 * against three writers: impala 1.2.0 (false), parquet-mr 1.8.0-SNAPSHOT
 * (true) and parquet-mr 1.8.0 (false).
 */
@Test
public void testEncodingRequiresSequentailRead() {
    final Encoding encoding = Encoding.DELTA_BYTE_ARRAY;

    // a file written by another application
    assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(new ParsedVersion("impala", "1.2.0", "abcd"), encoding));

    // a parquet-mr snapshot that is expected to need sequential reads
    assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(new ParsedVersion("parquet-mr", "1.8.0-SNAPSHOT", "abcd"), encoding));

    // the parquet-mr release that is expected not to need them
    assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(new ParsedVersion("parquet-mr", "1.8.0", "abcd"), encoding));
}
Also used : ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) Test(org.junit.Test)

Example 5 with ParsedVersion

use of org.apache.parquet.VersionParser.ParsedVersion in project parquet-mr by apache.

the class VersionTest method testFullVersion.

/**
 * Parses Version.FULL_VERSION and checks that the extracted version string
 * is valid, equals Version.VERSION_NUMBER, and that the application is
 * "parquet-mr".
 */
@Test
public void testFullVersion() throws Exception {
    ParsedVersion parsed = VersionParser.parse(Version.FULL_VERSION);
    String versionNumber = parsed.version;

    assertVersionValid(versionNumber);
    assertEquals(Version.VERSION_NUMBER, versionNumber);
    assertEquals("parquet-mr", parsed.application);
}
Also used : ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) Test(org.junit.Test)

Aggregations

ParsedVersion (org.apache.parquet.VersionParser.ParsedVersion): 5, Test (org.junit.Test): 4, VersionParseException (org.apache.parquet.VersionParser.VersionParseException): 2, ArrayList (java.util.ArrayList): 1, SemanticVersionParseException (org.apache.parquet.SemanticVersion.SemanticVersionParseException): 1, BytesInput (org.apache.parquet.bytes.BytesInput): 1, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1, ParquetProperties (org.apache.parquet.column.ParquetProperties): 1, PageWriter (org.apache.parquet.column.page.PageWriter): 1, MemPageStore (org.apache.parquet.column.page.mem.MemPageStore): 1, BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 1, ValuesWriter (org.apache.parquet.column.values.ValuesWriter): 1, DeltaByteArrayWriter (org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter): 1, Binary (org.apache.parquet.io.api.Binary): 1, PrimitiveConverter (org.apache.parquet.io.api.PrimitiveConverter): 1