Example 1 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.

From class ITTestHDFSParquetImportCommand, method testConvertWithUpsert.

/**
 * Test case for 'hdfsparquetimport' with upsert.
 */
@Test
public void testConvertWithUpsert() throws IOException, ParseException {
    Path upsertFolder = new Path(basePath, "testUpsertSrc");
    List<GenericRecord> upsertData = importer.createUpsertRecords(upsertFolder);
    // first insert records
    HDFSParquetImporter.Config cfg = importer.getHDFSParquetImporterConfig(sourcePath.toString(), tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), "_row_key", "timestamp", 1, schemaFile);
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
    dataImporter.dataImport(jsc, 0);
    // load the table metadata
    new TableCommand().connect(targetPath.toString(), TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7);
    metaClient = HoodieCLI.getTableMetaClient();
    // check that the insert instant exists
    assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have only 1 commit.");
    String command = String.format(
        "hdfsparquetimport --srcPath %s --targetPath %s --tableName %s --tableType %s"
            + " --rowKeyField %s --partitionPathField %s --parallelism %s --schemaFilePath %s"
            + " --format %s --sparkMemory %s --retry %s --sparkMaster %s --upsert %s",
        upsertFolder.toString(), targetPath.toString(), tableName, HoodieTableType.COPY_ON_WRITE.name(),
        "_row_key", "timestamp", "1", schemaFile, "parquet", "2G", "1", "local", "true");
    CommandResult cr = getShell().executeCommand(command);
    assertAll("Command run success",
        () -> assertTrue(cr.isSuccess()),
        () -> assertEquals("Table imported to hoodie format", cr.getResult().toString()));
    // reload the meta client to pick up the new commit
    metaClient = HoodieTableMetaClient.reload(metaClient);
    assertEquals(2, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 2 commits.");
    // construct the expected result: drop the first 11 inserted records and add the upsert data
    // (copy the sublist so addAll does not write through to the backing insertData list)
    List<GenericRecord> expectData = new ArrayList<>(insertData.subList(11, 96));
    expectData.addAll(upsertData);
    verifyResultData(expectData);
}
Also used: Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) TestHDFSParquetImporter(org.apache.hudi.utilities.functional.TestHDFSParquetImporter) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter) GenericRecord(org.apache.avro.generic.GenericRecord) TableCommand(org.apache.hudi.cli.commands.TableCommand) CommandResult(org.springframework.shell.core.CommandResult) AbstractShellIntegrationTest(org.apache.hudi.cli.testutils.AbstractShellIntegrationTest) Test(org.junit.jupiter.api.Test)
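
Both timeline assertions in this test follow the same pattern: reload the table's meta client, then count completed commit instants. Pulled into a small helper, the check looks like this (a minimal sketch using only the metadata calls already present above):

import org.apache.hudi.common.table.HoodieTableMetaClient;

public final class TimelineChecks {

    /**
     * Counts completed commits on the table's active timeline. reload() picks up
     * instants written after the meta client was first created.
     */
    public static int countCommits(HoodieTableMetaClient metaClient) {
        HoodieTableMetaClient reloaded = HoodieTableMetaClient.reload(metaClient);
        return reloaded.getActiveTimeline().getCommitsTimeline().countInstants();
    }
}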

Example 2 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.

From class SparkMain, method dataLoad.

/**
 * Builds an HDFSParquetImporter.Config from the CLI arguments and runs the import.
 */
private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath,
        String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
        String schemaFile, int retry, String propsFilePath, List<String> configs) {
    Config cfg = new Config();
    cfg.command = command;
    cfg.srcPath = srcPath;
    cfg.targetPath = targetPath;
    cfg.tableName = tableName;
    cfg.tableType = tableType;
    cfg.rowKey = rowKey;
    cfg.partitionKey = partitionKey;
    cfg.parallelism = parallelism;
    cfg.schemaFile = schemaFile;
    cfg.propsFilePath = propsFilePath;
    cfg.configs = configs;
    // dataImport returns -1 on failure (see testSchemaFile below)
    return new HDFSParquetImporter(cfg).dataImport(jsc, retry);
}
Also used: HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) Config(org.apache.hudi.utilities.HDFSParquetImporter.Config) HoodieBootstrapConfig(org.apache.hudi.config.HoodieBootstrapConfig) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter)
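
For readers who want to drive the importer outside the Hudi CLI, here is a minimal standalone sketch built from the same Config fields and the dataImport(jsc, retry) call shown in dataLoad above. The paths, table name, and Spark master are hypothetical placeholders, not values from the Hudi sources:

import java.util.Collections;

import org.apache.hudi.utilities.HDFSParquetImporter;
import org.apache.hudi.utilities.HDFSParquetImporter.Config;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class ParquetImportExample {

    public static void main(String[] args) {
        Config cfg = new Config();
        // "insert" for an initial load; "upsert" to merge into an existing table
        cfg.command = "insert";
        cfg.srcPath = "hdfs:///tmp/source-parquet";   // hypothetical source folder
        cfg.targetPath = "hdfs:///tmp/hudi-table";    // hypothetical target table path
        cfg.tableName = "exampleTable";               // hypothetical table name
        cfg.tableType = "COPY_ON_WRITE";
        cfg.rowKey = "_row_key";
        cfg.partitionKey = "timestamp";
        cfg.parallelism = 1;
        cfg.schemaFile = "hdfs:///tmp/file.schema";   // hypothetical Avro schema file
        cfg.configs = Collections.emptyList();

        JavaSparkContext jsc = new JavaSparkContext(
            new SparkConf().setAppName("hdfs-parquet-import").setMaster("local[2]"));
        // dataImport returns -1 on failure, as exercised by testSchemaFile below
        int ret = new HDFSParquetImporter(cfg).dataImport(jsc, 0);
        jsc.stop();
        System.exit(ret);
    }
}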

Example 3 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.

From class TestHDFSParquetImporter, method testImportWithUpsert.

/**
 * Tests upserting data and verifies data consistency.
 */
@Test
public void testImportWithUpsert() throws IOException, ParseException {
    insert(jsc());
    // reuse the schema file already created by insert()
    String schemaFile = new Path(basePath, "file.schema").toString();
    Path upsertFolder = new Path(basePath, "testUpsertSrc");
    List<GenericRecord> upsertData = createUpsertRecords(upsertFolder);
    HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(upsertFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
    // switch the importer from its default insert command to upsert
    cfg.command = "upsert";
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
    dataImporter.dataImport(jsc(), 0);
    // construct the expected result: drop the first 11 inserted records and add the upsert data
    // (copy the sublist so addAll does not write through to the backing insertData list)
    List<GenericRecord> expectData = new ArrayList<>(insertData.subList(11, 96));
    expectData.addAll(upsertData);
    // read latest data
    Dataset<Row> ds = HoodieClientTestUtils.read(jsc(), basePath + "/testTarget", sqlContext(), dfs(), basePath + "/testTarget/*/*/*/*");
    List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
    List<HoodieTripModel> result = readData.stream()
        .map(row -> new HoodieTripModel(row.getLong(0), row.getString(1), row.getString(2), row.getString(3),
            row.getDouble(4), row.getDouble(5), row.getDouble(6), row.getDouble(7)))
        .collect(Collectors.toList());
    // build the expected result from the generic records
    List<HoodieTripModel> expected = expectData.stream()
        .map(g -> new HoodieTripModel(Long.parseLong(g.get("timestamp").toString()),
            g.get("_row_key").toString(), g.get("rider").toString(), g.get("driver").toString(),
            Double.parseDouble(g.get("begin_lat").toString()), Double.parseDouble(g.get("begin_lon").toString()),
            Double.parseDouble(g.get("end_lat").toString()), Double.parseDouble(g.get("end_lon").toString())))
        .collect(Collectors.toList());
    // order-insensitive comparison: same elements in both directions and same size
    assertTrue(result.containsAll(expected) && expected.containsAll(result) && result.size() == expected.size());
}
Also used: Path(org.apache.hadoop.fs.Path) BeforeEach(org.junit.jupiter.api.BeforeEach) Dataset(org.apache.spark.sql.Dataset) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashMap(java.util.HashMap) Disabled(org.junit.jupiter.api.Disabled) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ParseException(java.text.ParseException) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) GenericRecord(org.apache.avro.generic.GenericRecord) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) Test(org.junit.jupiter.api.Test) Objects(java.util.Objects) TimeUnit(java.util.concurrent.TimeUnit) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) FunctionalTestHarness(org.apache.hudi.testutils.FunctionalTestHarness) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) Entry(java.util.Map.Entry) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter)
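
The closing assertion compares the two lists with containsAll in both directions plus a size check, an order-insensitive equality test. That only works if HoodieTripModel implements value-based equals and hashCode. The real class lives in the Hudi test sources; a minimal stand-in (named TripModel here to avoid claiming it matches the original) would look like:

import java.util.Objects;

class TripModel {
    final long timestamp;
    final String rowKey, rider, driver;
    final double beginLat, beginLon, endLat, endLon;

    TripModel(long timestamp, String rowKey, String rider, String driver,
              double beginLat, double beginLon, double endLat, double endLon) {
        this.timestamp = timestamp;
        this.rowKey = rowKey;
        this.rider = rider;
        this.driver = driver;
        this.beginLat = beginLat;
        this.beginLon = beginLon;
        this.endLat = endLat;
        this.endLon = endLon;
    }

    // value-based equality, required for containsAll to match records by content
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof TripModel)) return false;
        TripModel t = (TripModel) o;
        return timestamp == t.timestamp
            && Objects.equals(rowKey, t.rowKey)
            && Objects.equals(rider, t.rider)
            && Objects.equals(driver, t.driver)
            && beginLat == t.beginLat && beginLon == t.beginLon
            && endLat == t.endLat && endLon == t.endLon;
    }

    @Override
    public int hashCode() {
        return Objects.hash(timestamp, rowKey, rider, driver, beginLat, beginLon, endLat, endLon);
    }
}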

Example 4 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.

From class TestHDFSParquetImporter, method testSchemaFile.

/**
 * Tests for the schema file: 1. file is missing; 2. file has invalid data.
 */
@Test
public void testSchemaFile() throws Exception {
    // Hoodie root folder
    Path hoodieFolder = new Path(basePath, "testTarget");
    Path srcFolder = new Path(basePath.toString(), "srcTest");
    Path schemaFile = new Path(basePath.toString(), "missingFile.schema");
    HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile.toString());
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
    // should fail, returning -1: the schema file does not exist yet
    assertEquals(-1, dataImporter.dataImport(jsc(), 0));
    // write invalid schema data, closing the stream so the bytes are flushed to DFS
    try (FSDataOutputStream out = dfs().create(schemaFile)) {
        out.write("Random invalid schema data".getBytes());
    }
    // should fail again, returning -1: the schema data is invalid
    assertEquals(-1, dataImporter.dataImport(jsc(), 0));
}
Also used: Path(org.apache.hadoop.fs.Path) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter) Test(org.junit.jupiter.api.Test)
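
The counterpart to these failure cases is a well-formed schema file, which the insert() helper below creates via createSchemaFile. That helper is not shown on this page, so here is a hedged sketch of writing a schema with the eight fields the upsert test reads back; the record name triprec is an assumption:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SchemaFileExample {

    /**
     * Writes an Avro schema matching the fields these tests read back:
     * timestamp as long, _row_key/rider/driver as strings, coordinates as doubles.
     */
    public static void writeTripSchema(FileSystem fs, Path schemaFile) throws Exception {
        String schema = "{\"type\":\"record\",\"name\":\"triprec\",\"fields\":["
            + "{\"name\":\"timestamp\",\"type\":\"long\"},"
            + "{\"name\":\"_row_key\",\"type\":\"string\"},"
            + "{\"name\":\"rider\",\"type\":\"string\"},"
            + "{\"name\":\"driver\",\"type\":\"string\"},"
            + "{\"name\":\"begin_lat\",\"type\":\"double\"},"
            + "{\"name\":\"begin_lon\",\"type\":\"double\"},"
            + "{\"name\":\"end_lat\",\"type\":\"double\"},"
            + "{\"name\":\"end_lon\",\"type\":\"double\"}]}";
        // overwrite if present; close the stream so the bytes reach the filesystem
        try (FSDataOutputStream out = fs.create(schemaFile, true)) {
            out.write(schema.getBytes(StandardCharsets.UTF_8));
        }
    }
}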

Example 5 with HDFSParquetImporter

Use of org.apache.hudi.utilities.HDFSParquetImporter in project hudi by apache.

From class TestHDFSParquetImporter, method insert.

/**
 * Helper that seeds the target table: creates the schema file and runs an initial insert import.
 */
private void insert(JavaSparkContext jsc) throws IOException {
    // Create schema file.
    String schemaFile = new Path(basePath, "file.schema").toString();
    createSchemaFile(schemaFile);
    HDFSParquetImporter.Config cfg = getHDFSParquetImporterConfig(srcFolder.toString(), hoodieFolder.toString(), "testTable", "COPY_ON_WRITE", "_row_key", "timestamp", 1, schemaFile);
    HDFSParquetImporter dataImporter = new HDFSParquetImporter(cfg);
    dataImporter.dataImport(jsc, 0);
}
Also used: Path(org.apache.hadoop.fs.Path) HDFSParquetImporter(org.apache.hudi.utilities.HDFSParquetImporter)
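
The getHDFSParquetImporterConfig helper that every test above calls is not included on this page. Assuming it simply maps its arguments onto the same Config fields that SparkMain.dataLoad sets in Example 2, a plausible sketch is:

private HDFSParquetImporter.Config getHDFSParquetImporterConfig(String srcPath, String targetPath,
        String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
        String schemaFile) {
    HDFSParquetImporter.Config cfg = new HDFSParquetImporter.Config();
    cfg.srcPath = srcPath;
    cfg.targetPath = targetPath;
    cfg.tableName = tableName;
    cfg.tableType = tableType;
    cfg.rowKey = rowKey;
    cfg.partitionKey = partitionKey;
    cfg.parallelism = parallelism;
    cfg.schemaFile = schemaFile;
    // cfg.command is left at its default; testImportWithUpsert overrides it to "upsert"
    return cfg;
}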

Aggregations

HDFSParquetImporter (org.apache.hudi.utilities.HDFSParquetImporter): 7
Path (org.apache.hadoop.fs.Path): 6
Test (org.junit.jupiter.api.Test): 5
HashMap (java.util.HashMap): 2
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2
GenericRecord (org.apache.avro.generic.GenericRecord): 2
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 2
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2
IOException (java.io.IOException): 1
Serializable (java.io.Serializable): 1
ParseException (java.text.ParseException): 1
ArrayList (java.util.ArrayList): 1
List (java.util.List): 1
Map (java.util.Map): 1
Entry (java.util.Map.Entry): 1
Objects (java.util.Objects): 1
TimeUnit (java.util.concurrent.TimeUnit): 1
Collectors (java.util.stream.Collectors): 1
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 1