Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.
From the class ITTestHDFSParquetImportCommand, method testConvertWithUpsert.
/**
* Test case for 'hdfsparquetimport' with upsert.
*/
@Test
public void testConvertWithUpsert() throws IOException, ParseException {
Path upsertFolder = new Path(basePath, "testUpsertSrc");
List<GenericRecord> upsertData = importer.createUpsertRecords(upsertFolder);
// first insert records
HDFSParquetImporter.Config cfg = importer.getHDFSParquetImporterConfig(sourcePath.toString(), tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "_row_key", "timestamp", 1, schemaFile);
HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
dataImporter.dataImport(jsc, 0);
// Load meta data
new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7);
metaClient = HoodieCLI.getTableMetaClient();
// check if insert instant exist
assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should only 1 commit.");
String command = String.format(
    "hdfsparquetimport --srcPath %s --targetPath %s --tableName %s "
        + "--tableType %s --rowKeyField %s --partitionPathField %s --parallelism %s "
        + "--schemaFilePath %s --format %s --sparkMemory %s --retry %s --sparkMaster %s --upsert %s",
    upsertFolder.toString(), targetPath.toString(), tableName, HoodieTableType.COPY_ON_WRITE.name(),
    "_row_key", "timestamp", "1", schemaFile, "parquet", "2G", "1", "local", "true");
CommandResult cr = getShell().executeCommand(command);
assertAll("Command run success", () -> assertTrue(cr.isSuccess()), () -> assertEquals("Table imported to hoodie format", cr.getResult().toString()));
// reload meta client
metaClient = HoodieTableMetaClient.reload(metaClient);
assertEquals(2, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 2 commit.");
// construct result, remove top 10 and add upsert data.
List<GenericRecord> expectData = insertData.subList(11, 96);
expectData.addAll(upsertData);
verifyResultData(expectData);
}
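The CLI flags in the command string above map onto fields of HDFSParquetImporter.Config, the same fields that SparkMain.dataLoad (below) populates. As a rough sketch only, the equivalent upsert import could be driven programmatically along these lines; the helper name upsertImport is hypothetical, and the flag-to-field mapping is inferred from this test and from dataLoad rather than taken from Hudi's CLI wiring itself.
import org.apache.hudi.utilities.HDFSParquetImporter;
import org.apache.hudi.utilities.HDFSParquetImporter.Config;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical helper; jsc and the path/table arguments are assumed to come from the surrounding test setup.
static int upsertImport(JavaSparkContext jsc, String srcPath, String targetPath, String tableName, String schemaFile) {
    Config cfg = new Config();
    cfg.command = "upsert";          // mirrors --upsert true
    cfg.srcPath = srcPath;           // --srcPath
    cfg.targetPath = targetPath;     // --targetPath
    cfg.tableName = tableName;       // --tableName
    cfg.tableType = "COPY_ON_WRITE"; // --tableType
    cfg.rowKey = "_row_key";         // --rowKeyField
    cfg.partitionKey = "timestamp";  // --partitionPathField
    cfg.parallelism = 1;             // --parallelism
    cfg.schemaFile = schemaFile;     // --schemaFilePath
    return new HDFSParquetImporter(cfg).dataImport(jsc, 1); // retry once, as --retry 1 does
}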
Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.
From the class SparkMain, method dataLoad.
private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath, String tableName, String tableType, String rowKey, String partitionKey, int parallelism, String schemaFile, int retry, String propsFilePath, List<String> configs) {
Config cfg = new Config();
cfg.command = command;
cfg.srcPath = srcPath;
cfg.targetPath = targetPath;
cfg.tableName = tableName;
cfg.tableType = tableType;
cfg.rowKey = rowKey;
cfg.partitionKey = partitionKey;
cfg.parallelism = parallelism;
cfg.schemaFile = schemaFile;
cfg.propsFilePath = propsFilePath;
cfg.configs = configs;
return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
}
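A hypothetical invocation, purely to illustrate the argument order and types dataLoad expects; the literal paths and the empty extra-config list are placeholders, not values used by Hudi's CLI layer, which is what actually calls this method.
// Illustrative call only; paths are placeholders.
int ret = dataLoad(jsc, "insert",
    "hdfs:///tmp/import/src", "hdfs:///tmp/import/table", "testTable",
    "COPY_ON_WRITE", "_row_key", "timestamp", 1,
    "hdfs:///tmp/import/file.schema", 1,
    null, java.util.Collections.emptyList());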
Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.
From the class TestHDFSParquetImporter, method testImportWithUpsert.
/**
* Test upsert data and verify data consistency.
*/
@Test
public void testImportWithUpsert() throws IOException, ParseException {
insert(jsc());
// Create schema file.
String schemaFile = new Path(basePath, "file.schema").toString();
Path upsertFolder = new Path(basePath, "testUpsertSrc");
List<GenericRecord> upsertData = createUpsertRecords(upsertFolder);
HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(upsertFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
cfg.command = "upsert";
HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
dataImporter.dataImport(jsc(), 0);
// construct result, remove top 10 and add upsert data.
List<GenericRecord> expectData = insertData.subList(11, 96);
expectData.addAll(upsertData);
// read latest data
Dataset<Row> ds = HoodieClientTestUtils.read(jsc(), basePath + "/testTarget", sqlContext(), dfs(), basePath + "/testTarget/*/*/*/*");
List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
List<HoodieTripModel> result = readData.stream()
    .map(row -> new HoodieTripModel(row.getLong(0), row.getString(1), row.getString(2), row.getString(3),
        row.getDouble(4), row.getDouble(5), row.getDouble(6), row.getDouble(7)))
    .collect(Collectors.toList());
// get expected result.
List<HoodieTripModel> expected = expectData.stream()
    .map(g -> new HoodieTripModel(Long.parseLong(g.get("timestamp").toString()), g.get("_row_key").toString(),
        g.get("rider").toString(), g.get("driver").toString(),
        Double.parseDouble(g.get("begin_lat").toString()), Double.parseDouble(g.get("begin_lon").toString()),
        Double.parseDouble(g.get("end_lat").toString()), Double.parseDouble(g.get("end_lon").toString())))
    .collect(Collectors.toList());
assertTrue(result.containsAll(expected) && expected.containsAll(result) && result.size() == expected.size());
}
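The containsAll-in-both-directions check above tolerates reordering, but since it compares membership rather than per-element counts it can, in principle, miss cases where the two lists hold the same elements with different multiplicities. A frequency-map comparison is one possible tightening; this is only a sketch, and it assumes HoodieTripModel implements equals/hashCode (which the existing check already relies on).
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Order-insensitive comparison that also checks how many times each element occurs.
static <T> boolean sameMultiset(List<T> a, List<T> b) {
    Map<T, Long> countA = new HashMap<>();
    Map<T, Long> countB = new HashMap<>();
    a.forEach(x -> countA.merge(x, 1L, Long::sum)); // count occurrences in a
    b.forEach(x -> countB.merge(x, 1L, Long::sum)); // count occurrences in b
    return countA.equals(countB);
}
With such a helper, the final assertion could read assertTrue(sameMultiset(result, expected)).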
Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.
From the class TestHDFSParquetImporter, method testSchemaFile.
/**
* Tests for schema file. 1. File is missing. 2. File has invalid data.
*/
@Test
public void testSchemaFile() throws Exception {
// Hoodie root folder
Path hoodieFolder = new Path(basePath, "testTarget");
Path srcFolder = new Path(basePath.toString(), "srcTest");
Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile.toString());
HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
// Should fail - return : -1.
assertEquals(-1, dataImporter.dataImport(jsc(), 0));
dfs().create(schemaFile).write("Random invalid schema data".getBytes());
// Should fail - return : -1.
assertEquals(-1, dataImporter.dataImport(jsc(), 0));
}
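For contrast with the invalid-schema case above, a valid Avro schema can be written to DFS through the same FileSystem API before running the importer. This is only a sketch: the record name and field list are illustrative and do not reproduce the schema these tests actually use (createSchemaFile in the test class writes the real one).
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper writing a syntactically valid Avro record schema to DFS.
static void writeSchemaFile(FileSystem fs, Path schemaPath) throws IOException {
    String schema = "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
        + "{\"name\":\"timestamp\",\"type\":\"long\"},"
        + "{\"name\":\"_row_key\",\"type\":\"string\"}]}";
    try (FSDataOutputStream out = fs.create(schemaPath, true)) { // overwrite if the file exists
        out.write(schema.getBytes(StandardCharsets.UTF_8));
    }
}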
Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.
From the class TestHDFSParquetImporter, method insert.
private void insert(JavaSparkContext jsc) throws IOException {
// Create schema file.
String schemaFile = new Path(basePath, "file.schema").toString();
createSchemaFile(schemaFile);
HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
dataImporter.dataImport(jsc, 0);
}