Example usage of org.apache.hudi.utilities.HDFSParquetImporter in the Apache Hudi project.
From class TestHDFSParquetImporter, method testImportWithRetries.
/**
 * Test successful data import with retries.
 *
 * <p>The schema file is intentionally absent at the start, so each import attempt fails until
 * the overridden {@code dataImport} creates it once the retry counter reaches zero; the overall
 * import must then succeed (return code 0).
 */
@Test
public void testImportWithRetries() throws Exception {
  // Path of the schema file the importer expects; deliberately NOT created yet.
  String schemaFile = new Path(basePath, "file.schema").toString();
  HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
  AtomicInteger retry = new AtomicInteger(3);
  AtomicInteger fileCreated = new AtomicInteger(0);
  // Subclass the importer so the schema file appears only when the retry budget is exhausted.
  HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg) {
    @Override
    protected int dataImport(JavaSparkContext jsc) throws IOException {
      int ret = super.dataImport(jsc);
      if (retry.decrementAndGet() == 0) {
        fileCreated.incrementAndGet();
        createSchemaFile(schemaFile);
      }
      return ret;
    }
  };
  // Early attempts fail because the schema file is missing, but the import must succeed
  // overall (return 0) once the file is created on the final retry.
  assertEquals(0, dataImporter.dataImport(jsc(), retry.get()));
  assertEquals(-1, retry.get());
  assertEquals(1, fileCreated.get());
  // Verify:
  // 1. a .commit file is present,
  // 2. each partition holds 24 records,
  // 3. there are 4 partitions in total.
  boolean isCommitFilePresent = false;
  Map<String, Long> recordCounts = new HashMap<>();
  RemoteIterator<LocatedFileStatus> hoodieFiles = dfs().listFiles(hoodieFolder, true);
  while (hoodieFiles.hasNext()) {
    LocatedFileStatus f = hoodieFiles.next();
    isCommitFilePresent = isCommitFilePresent || f.getPath().toString().endsWith(HoodieTimeline.COMMIT_EXTENSION);
    if (f.getPath().toString().endsWith("parquet")) {
      String partitionPath = f.getPath().getParent().toString();
      long count = sqlContext().read().parquet(f.getPath().toString()).count();
      // Accumulate per-partition record counts; merge replaces the containsKey/get/put dance.
      recordCounts.merge(partitionPath, count, Long::sum);
    }
  }
  assertTrue(isCommitFilePresent, "commit file is missing");
  assertEquals(4, recordCounts.size(), "partition is missing");
  for (Entry<String, Long> e : recordCounts.entrySet()) {
    assertEquals(24, e.getValue().longValue(), "missing records");
  }
}
Example usage of org.apache.hudi.utilities.HDFSParquetImporter in the Apache Hudi project.
From class TestHDFSParquetImporter, method testRowAndPartitionKey.
/**
 * Test for missing rowKey and partitionKey.
 *
 * <p>An import configured with a row key or a partition (timestamp) key that does not exist in
 * the source data must fail with return code -1.
 */
@Test
public void testRowAndPartitionKey() throws Exception {
  // Create a valid schema file so that only the key configuration can be at fault.
  Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
  createSchemaFile(schemaFile.toString());
  // A row key that is not present in the source records must abort the import.
  HDFSParquetImporter.Config badRowKeyCfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "invalidRowKey", "timestamp", 1, schemaFile.toString());
  assertEquals(-1, new HDFSParquetImporter(badRowKeyCfg).dataImport(jsc(), 0));
  // Likewise for a partition key that is not present.
  HDFSParquetImporter.Config badPartitionKeyCfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "invalidTimeStamp", 1, schemaFile.toString());
  assertEquals(-1, new HDFSParquetImporter(badPartitionKeyCfg).dataImport(jsc(), 0));
}
Aggregations