Search in sources :

Example 1 with SortGroupSplit

Use of uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.SortGroupSplit in project Gaffer by gchq.

The class SortGroupSplitTest, method sortTest.

/**
 * Runs {@link SortGroupSplit} over generated input data, passing the vertex and date columns as
 * the sort columns, then checks that exactly one Parquet file is written to the output directory
 * and that its first 40 rows hold the expected values in the expected order.
 *
 * @param tempDir temporary directory (injected via {@code @TempDir}) under which the
 *                "input" and "output" directories are resolved
 * @throws IOException declared for the file-system calls made by this test
 */
@Test
public void sortTest(@TempDir java.nio.file.Path tempDir) throws IOException {
    // Given
    final FileSystem fs = FileSystem.get(new Configuration());
    final SparkSession sparkSession = SparkSessionProvider.getSparkSession();
    final java.nio.file.Path inputLocation = Files.createDirectories(tempDir.resolve("input"));
    final String inputDir = inputLocation.toString();
    final String outputDir = tempDir.resolve("output").toString();
    generateDate(inputDir);
    final List<String> sortColumns = new ArrayList<>();
    sortColumns.add(ParquetStore.VERTEX);
    sortColumns.add("date");

    // When
    final SortGroupSplit sortGroupSplit =
            new SortGroupSplit(fs, sparkSession, sortColumns, inputDir, outputDir, CompressionCodecName.GZIP);
    sortGroupSplit.call();

    // Then
    // - The output directory must exist and hold exactly one Parquet file
    final Path outputPath = new Path(outputDir);
    assertTrue(fs.exists(outputPath));
    final FileStatus[] outputFiles = fs.listStatus(outputPath, candidate -> candidate.getName().endsWith(".parquet"));
    assertThat(outputFiles).hasSize(1);

    // - Read the single file back and verify the rows come out in the expected order
    final String parquetFile = outputFiles[0].getPath().toString();
    final Row[] results = (Row[]) sparkSession.read().parquet(parquetFile).collect();
    for (int i = 0; i < 40; i++) {
        final Row row = results[i];
        // Rows are expected in pairs: the same vertex twice, first with the earlier date
        final int pairIndex = i / 2;
        final long expectedDate = (i % 2 == 0) ? new Date(100000L).getTime() : new Date(200000L).getTime();

        assertEquals((long) pairIndex, (long) row.getAs(ParquetStore.VERTEX));
        assertEquals('b', ((byte[]) row.getAs("byte"))[0]);
        assertEquals(7f, row.getAs("float"), 0.01f);
        assertEquals(11L * pairIndex, (long) row.getAs("long"));
        assertEquals(13, (int) row.getAs("short"));
        assertEquals(expectedDate, (long) row.getAs("date"));
        assertEquals(2, (int) row.getAs("count"));
        assertArrayEquals(new String[] { "A", "B", "C" }, (String[]) ((WrappedArray<String>) row.getAs("treeSet")).array());
        assertEquals(JavaConversions$.MODULE$.mapAsScalaMap(TestUtils.MERGED_FREQMAP), row.getAs("freqMap"));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SparkSession(org.apache.spark.sql.SparkSession) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) Date(java.util.Date) SortGroupSplit(uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.SortGroupSplit) WrappedArray(scala.collection.mutable.WrappedArray) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) Test(org.junit.jupiter.api.Test)

Aggregations

ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 Row (org.apache.spark.sql.Row)1 SparkSession (org.apache.spark.sql.SparkSession)1 Test (org.junit.jupiter.api.Test)1 WrappedArray (scala.collection.mutable.WrappedArray)1 SortGroupSplit (uk.gov.gchq.gaffer.parquetstore.operation.handler.utilities.SortGroupSplit)1