Search in sources :

Example 1 with StringParquetSerialiser

use of uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser in project Gaffer by gchq.

the class WriteDataTest method testTwoWritesToSamePartitionDoesntThrowException.

/**
 * Runs three concurrent {@code WriteData.call} invocations that share the same second
 * argument ({@code 1}) but use distinct third arguments (1000, 1001, 1002), then checks
 * that none of them failed and that {@code input-1.parquet} exists in both the
 * "entity" and "edge" group directories.
 *
 * @param tempDir temporary directory under which one sub-directory per group is used
 * @throws Exception if set-up fails or if any of the concurrent writes threw
 *                   (surfaced as an {@code ExecutionException} from {@code Future.get()})
 */
@Test
public void testTwoWritesToSamePartitionDoesntThrowException(@TempDir java.nio.file.Path tempDir) throws Exception {
    // Given
    final Schema schema = new Schema.Builder()
            .type("int", new TypeDefinition.Builder().clazz(Integer.class).serialiser(new IntegerParquetSerialiser()).build())
            .type("string", new TypeDefinition.Builder().clazz(String.class).serialiser(new StringParquetSerialiser()).build())
            .entity("entity", new SchemaEntityDefinition.Builder().vertex("string").property("property1", "int").aggregate(false).build())
            .edge("edge", new SchemaEdgeDefinition.Builder().source("string").destination("string").property("property2", "int").aggregate(false).build())
            .vertexSerialiser(new StringParquetSerialiser())
            .build();
    final Function<String, String> groupToDirectory = group -> tempDir.toAbsolutePath().toString() + "/" + group;
    final List<Element> elements = new ArrayList<>();
    elements.add(new Entity.Builder().group("entity").vertex("A").property("property1", 1).build());
    elements.add(new Edge.Builder().group("edge").source("B").dest("C").property("property2", 100).build());
    final WriteData writeData = new WriteData(groupToDirectory, schema, CompressionCodecName.GZIP);
    final FileSystem fileSystem = FileSystem.get(new Configuration());
    // When
    final ExecutorService executorService = Executors.newFixedThreadPool(3);
    final List<Callable<Void>> tasks = new ArrayList<>();
    LongStream.range(1000L, 1003L).forEach(l -> {
        tasks.add(() -> {
            writeData.call(elements.iterator(), 1, l);
            return null;
        });
    });
    try {
        // invokeAll only waits for completion; a failure inside a task stays hidden in its
        // Future until get() is called, so each result must be inspected explicitly.
        for (final java.util.concurrent.Future<Void> result : executorService.invokeAll(tasks)) {
            result.get();
        }
    } finally {
        // Release the worker threads whether or not the writes succeeded.
        executorService.shutdown();
    }
    // Then
    // - Check that a file named with the partition id has been created
    assertTrue(fileSystem.exists(new Path(groupToDirectory.apply("entity") + "/" + "input-1.parquet")));
    assertTrue(fileSystem.exists(new Path(groupToDirectory.apply("edge") + "/" + "input-1.parquet")));
}
Also used : TypeDefinition(uk.gov.gchq.gaffer.store.schema.TypeDefinition) LongStream(java.util.stream.LongStream) FileSystem(org.apache.hadoop.fs.FileSystem) StringParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser) SchemaEdgeDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEdgeDefinition) Callable(java.util.concurrent.Callable) Entity(uk.gov.gchq.gaffer.data.element.Entity) Function(java.util.function.Function) Element(uk.gov.gchq.gaffer.data.element.Element) Executors(java.util.concurrent.Executors) ArrayList(java.util.ArrayList) Test(org.junit.jupiter.api.Test) IntegerParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.IntegerParquetSerialiser) List(java.util.List) Schema(uk.gov.gchq.gaffer.store.schema.Schema) SchemaEntityDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEntityDefinition) TempDir(org.junit.jupiter.api.io.TempDir) Configuration(org.apache.hadoop.conf.Configuration) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) Path(org.apache.hadoop.fs.Path) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) Edge(uk.gov.gchq.gaffer.data.element.Edge) ExecutorService(java.util.concurrent.ExecutorService) Path(org.apache.hadoop.fs.Path) StringParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser) Entity(uk.gov.gchq.gaffer.data.element.Entity) Configuration(org.apache.hadoop.conf.Configuration) Schema(uk.gov.gchq.gaffer.store.schema.Schema) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) Callable(java.util.concurrent.Callable) TypeDefinition(uk.gov.gchq.gaffer.store.schema.TypeDefinition) FileSystem(org.apache.hadoop.fs.FileSystem) ExecutorService(java.util.concurrent.ExecutorService) IntegerParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.IntegerParquetSerialiser) Test(org.junit.jupiter.api.Test)

Example 2 with StringParquetSerialiser

use of uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser in project Gaffer by gchq.

the class ParquetStoreTest method shouldCorrectlyUseCompressionOption.

/**
 * For each of the codec names "GZIP", "SNAPPY" and "UNCOMPRESSED": creates a store whose
 * properties carry that codec name, adds one entity and one edge, and asserts that every
 * column chunk in every file returned for the "entity" group reports that codec.
 *
 * @param tempDir temporary directory passed to {@code TestUtils.getParquetStoreProperties}
 * @throws Exception if store creation, the add operation or reading a file footer fails
 */
@Test
public void shouldCorrectlyUseCompressionOption(@TempDir java.nio.file.Path tempDir) throws Exception {
    for (final String compressionType : Sets.newHashSet("GZIP", "SNAPPY", "UNCOMPRESSED")) {
        // Given: a schema with one entity group and one edge group, built piece by piece
        final TypeDefinition intType = new TypeDefinition.Builder()
                .clazz(Integer.class)
                .serialiser(new IntegerParquetSerialiser())
                .build();
        final TypeDefinition stringType = new TypeDefinition.Builder()
                .clazz(String.class)
                .serialiser(new StringParquetSerialiser())
                .build();
        final SchemaEntityDefinition entityDefinition = new SchemaEntityDefinition.Builder()
                .vertex("string")
                .property("property1", "int")
                .aggregate(false)
                .build();
        final SchemaEdgeDefinition edgeDefinition = new SchemaEdgeDefinition.Builder()
                .source("string")
                .destination("string")
                .property("property2", "int")
                .directed(DIRECTED_EITHER)
                .aggregate(false)
                .build();
        final Schema schema = new Schema.Builder()
                .type("int", intType)
                .type("string", stringType)
                .type(DIRECTED_EITHER, Boolean.class)
                .entity("entity", entityDefinition)
                .edge("edge", edgeDefinition)
                .vertexSerialiser(new StringParquetSerialiser())
                .build();

        // Given: a store configured with the codec under test
        final ParquetStoreProperties storeProperties = TestUtils.getParquetStoreProperties(tempDir);
        storeProperties.setCompressionCodecName(compressionType);
        final ParquetStore store = (ParquetStore) ParquetStore.createStore("graphId", schema, storeProperties);

        // Given: one element per group
        final Entity entity = new Entity.Builder().group("entity").vertex("A").property("property1", 1).build();
        final Edge edge = new Edge.Builder().group("edge").source("B").dest("C").property("property2", 100).build();
        final List<Element> elements = new ArrayList<>();
        elements.add(entity);
        elements.add(edge);

        // When
        store.execute(new AddElements.Builder().input(elements).build(), new Context());

        // Then: every column chunk of every "entity" file reports the configured codec
        for (final Path file : store.getFilesForGroup("entity")) {
            final ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), file, ParquetMetadataConverter.NO_FILTER);
            footer.getBlocks().forEach(block ->
                    block.getColumns().forEach(column -> assertEquals(compressionType, column.getCodec().name())));
        }
    }
}
Also used : AddElements(uk.gov.gchq.gaffer.operation.impl.add.AddElements) Entity(uk.gov.gchq.gaffer.data.element.Entity) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Schema(uk.gov.gchq.gaffer.store.schema.Schema) Element(uk.gov.gchq.gaffer.data.element.Element) ArrayList(java.util.ArrayList) TypeDefinition(uk.gov.gchq.gaffer.store.schema.TypeDefinition) IntegerParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.IntegerParquetSerialiser) Context(uk.gov.gchq.gaffer.store.Context) Path(org.apache.hadoop.fs.Path) StringParquetSerialiser(uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser) TestUtils.getParquetStoreProperties(uk.gov.gchq.gaffer.parquetstore.testutils.TestUtils.getParquetStoreProperties) SchemaEdgeDefinition(uk.gov.gchq.gaffer.store.schema.SchemaEdgeDefinition) Test(org.junit.jupiter.api.Test)

Aggregations

ArrayList (java.util.ArrayList)2 Configuration (org.apache.hadoop.conf.Configuration)2 Path (org.apache.hadoop.fs.Path)2 Test (org.junit.jupiter.api.Test)2 Element (uk.gov.gchq.gaffer.data.element.Element)2 Entity (uk.gov.gchq.gaffer.data.element.Entity)2 IntegerParquetSerialiser (uk.gov.gchq.gaffer.parquetstore.serialisation.impl.IntegerParquetSerialiser)2 StringParquetSerialiser (uk.gov.gchq.gaffer.parquetstore.serialisation.impl.StringParquetSerialiser)2 Schema (uk.gov.gchq.gaffer.store.schema.Schema)2 SchemaEdgeDefinition (uk.gov.gchq.gaffer.store.schema.SchemaEdgeDefinition)2 TypeDefinition (uk.gov.gchq.gaffer.store.schema.TypeDefinition)2 List (java.util.List)1 Callable (java.util.concurrent.Callable)1 ExecutorService (java.util.concurrent.ExecutorService)1 Executors (java.util.concurrent.Executors)1 Function (java.util.function.Function)1 LongStream (java.util.stream.LongStream)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)1 CompressionCodecName (org.apache.parquet.hadoop.metadata.CompressionCodecName)1