Search in sources :

Example 6 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in the Apache Hudi project.

From the class TestHDFSParquetImporter, method testImportWithRetries.

/**
 * Test successful data import with retries.
 */
@Test
public void testImportWithRetries() throws Exception {
    // Path of the schema file; intentionally NOT written yet, so the first
    // import attempts are expected to fail.
    String schemaFile = new Path(basePath, "file.schema").toString();
    HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
    // Counts down one per import attempt; the schema file is materialized
    // when this reaches zero, so the following retry can succeed.
    AtomicInteger remainingFailures = new AtomicInteger(3);
    AtomicInteger schemaFilesCreated = new AtomicInteger(0);
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg) {

        @Override
        protected int dataImport(JavaSparkContext jsc) throws IOException {
            // Run the real import first, then (on the final scheduled failure)
            // create the schema file so the next retry succeeds.
            int result = super.dataImport(jsc);
            if (remainingFailures.decrementAndGet() == 0) {
                schemaFilesCreated.incrementAndGet();
                createSchemaFile(schemaFile);
            }
            return result;
        }
    };
    // Early attempts fail (schema file absent); the retry loop eventually
    // succeeds once the schema file exists, so the overall result is 0.
    assertEquals(0, dataImporter.dataImport(jsc(), remainingFailures.get()));
    // 3 failures + 1 successful attempt => counter decremented to -1.
    assertEquals(-1, remainingFailures.get());
    assertEquals(1, schemaFilesCreated.get());
    // Verify the imported dataset:
    // 1. a .commit file exists
    // 2. each partition holds 24 records
    // 3. there are exactly 4 partitions
    boolean commitFileFound = false;
    Map<String, Long> countsPerPartition = new HashMap<String, Long>();
    RemoteIterator<LocatedFileStatus> files = dfs().listFiles(hoodieFolder, true);
    while (files.hasNext()) {
        LocatedFileStatus status = files.next();
        String path = status.getPath().toString();
        if (path.endsWith(HoodieTimeline.COMMIT_EXTENSION)) {
            commitFileFound = true;
        }
        if (path.endsWith("parquet")) {
            String partition = status.getPath().getParent().toString();
            long rows = sqlContext().read().parquet(path).count();
            // Accumulate per-partition record counts across data files.
            countsPerPartition.merge(partition, rows, Long::sum);
        }
    }
    assertTrue(commitFileFound, "commit file is missing");
    assertEquals(4, countsPerPartition.size(), "partition is missing");
    for (Entry<String, Long> entry : countsPerPartition.entrySet()) {
        assertEquals(24, entry.getValue().longValue(), "missing records");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.junit.jupiter.api.Test)

Example 7 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in the Apache Hudi project.

From the class TestHDFSParquetImporter, method testRowAndPartitionKey.

/**
 * Test for missing rowKey and partitionKey.
 */
@Test
public void testRowAndPartitionKey() throws Exception {
    // Write a valid schema file up front, so the failures below come only
    // from the bad key fields, not from a missing schema.
    Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
    createSchemaFile(schemaFile.toString());
    // An unknown row-key field must make the import fail (-1).
    HDFSParquetImporter.Config badRowKeyCfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "invalidRowKey", "timestamp", 1, schemaFile.toString());
    assertEquals(-1, new HDFSParquetImporter(badRowKeyCfg).dataImport(jsc(), 0));
    // An unknown partition-key field must make the import fail (-1).
    HDFSParquetImporter.Config badPartitionKeyCfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", 1, schemaFile.toString());
    assertEquals(-1, new HDFSParquetImporter(badPartitionKeyCfg).dataImport(jsc(), 0));
}
Also used : Path(org.apache.hadoop.fs.Path) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter) Test(org.junit.jupiter.api.Test)

Aggregations

HDFSParquetImporter (org.apache.hudi.utilities.HDFSParquetImporter)7 Path (org.apache.hadoop.fs.Path)6 Test (org.junit.jupiter.api.Test)5 HashMap (java.util.HashMap)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2 IOException (java.io.IOException)1 Serializable (java.io.Serializable)1 ParseException (java.text.ParseException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 Entry (java.util.Map.Entry)1 Objects (java.util.Objects)1 TimeUnit (java.util.concurrent.TimeUnit)1 Collectors (java.util.stream.Collectors)1 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)1 RemoteIterator (org.apache.hadoop.fs.RemoteIterator)1