Search in sources :

Example 1 with Schema

use of org.apache.arrow.vector.types.pojo.Schema in project parquet-mr by apache.

the class SchemaConverter method fromParquet.

/**
 * Converts a Parquet schema into an Arrow schema.
 *
 * @param parquetSchema the Parquet schema to convert
 * @return a {@link SchemaMapping} built from the new Arrow schema, the given
 *         Parquet schema and the type mappings computed for its fields
 */
public SchemaMapping fromParquet(MessageType parquetSchema) {
    // Map the top-level Parquet fields first; the Arrow fields are derived from those mappings.
    final List<TypeMapping> typeMappings = fromParquet(parquetSchema.getFields());
    final Schema arrowSchema = new Schema(fields(typeMappings));
    return new SchemaMapping(arrowSchema, parquetSchema, typeMappings);
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) OriginalType(org.apache.parquet.schema.OriginalType) Schema(org.apache.arrow.vector.types.pojo.Schema) PrimitiveTypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.PrimitiveTypeMapping) RepeatedTypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.RepeatedTypeMapping) UnionTypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.UnionTypeMapping) ListTypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.ListTypeMapping) StructTypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.StructTypeMapping) TypeMapping(org.apache.parquet.arrow.schema.SchemaMapping.TypeMapping)

Example 2 with Schema

use of org.apache.arrow.vector.types.pojo.Schema in project flink by apache.

the class ArrowUtilsTest method testConvertBetweenLogicalTypeAndArrowType.

/**
 * Checks that converting {@code rowType} to an Arrow schema yields one field per
 * entry of {@code testFields}, with matching name ({@code f0}) and Arrow type ({@code f2}).
 */
@Test
public void testConvertBetweenLogicalTypeAndArrowType() {
    final Schema arrowSchema = ArrowUtils.toArrowSchema(rowType);
    final List<Field> arrowFields = arrowSchema.getFields();
    assertEquals(testFields.size(), arrowFields.size());

    int index = 0;
    for (Field arrowField : arrowFields) {
        // verify the conversion from RowType to ArrowType, position by position
        assertEquals(testFields.get(index).f0, arrowField.getName());
        assertEquals(testFields.get(index).f2, arrowField.getType());
        index++;
    }
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) Schema(org.apache.arrow.vector.types.pojo.Schema) Test(org.junit.Test)

Example 3 with Schema

use of org.apache.arrow.vector.types.pojo.Schema in project twister2 by DSC-SPIDAL.

the class BTAllToAll method execute.

/**
 * Builds a two-column Arrow table (an int column and a float8 column, 1000 rows each),
 * inserts it into an {@link ArrowAllToAll} operation for every target, finishes every
 * source on this worker, and then spins until the operation reports completion.
 */
@Override
public void execute(Config config, JobAPI.Job job, IWorkerController workerController, IPersistentVolume persistentVolume, IVolatileVolume volatileVolume) {
    this.jobParameters = JobParameters.build(config);
    // create a worker environment
    this.wEnv = WorkerEnvironment.init(config, job, workerController, persistentVolume, volatileVolume);
    LogicalPlanBuilder planBuilder = LogicalPlanBuilder.plan(jobParameters.getSources(), jobParameters.getTargets(), wEnv).withFairDistribution();

    // Fill both columns with the values 0..rowCount-1.
    final int rowCount = 1000;
    RootAllocator allocator = new RootAllocator();
    IntVector intColumn = new IntVector("fist", allocator);
    Float8Vector doubleColumn = new Float8Vector("second", allocator);
    int row = 0;
    while (row < rowCount) {
        intColumn.setSafe(row, row);
        doubleColumn.setSafe(row, row);
        row++;
    }
    intColumn.setValueCount(rowCount);
    doubleColumn.setValueCount(rowCount);

    // The schema is derived from the vectors' own field descriptors.
    Schema schema = new Schema(Arrays.asList(intColumn.getField(), doubleColumn.getField()));
    FieldVector[] columns = { intColumn, doubleColumn };
    Table table = new ArrowTable(schema, Arrays.asList(columns));

    allToAll = new ArrowAllToAll(wEnv.getConfig(), wEnv.getWorkerController(), planBuilder.getSources(), planBuilder.getTargets(), planBuilder.build(), wEnv.getCommunicator().nextEdge(), new ArrowReceiver(), schema, allocator);

    // Send the same table to every target.
    for (int target : planBuilder.getTargets()) {
        allToAll.insert(table, target);
    }
    for (int source : planBuilder.getSourcesOnThisWorker()) {
        allToAll.finish(source);
    }

    // Spin until the operation reports that it is complete.
    for (;;) {
        if (allToAll.isComplete()) {
            break;
        }
    }
}
Also used : Table(edu.iu.dsc.tws.common.table.Table) ArrowTable(edu.iu.dsc.tws.common.table.arrow.ArrowTable) IntVector(org.apache.arrow.vector.IntVector) ArrowAllToAll(edu.iu.dsc.tws.comms.table.ArrowAllToAll) Float8Vector(org.apache.arrow.vector.Float8Vector) Schema(org.apache.arrow.vector.types.pojo.Schema) LogicalPlanBuilder(edu.iu.dsc.tws.comms.utils.LogicalPlanBuilder) FieldVector(org.apache.arrow.vector.FieldVector) Field(org.apache.arrow.vector.types.pojo.Field) RootAllocator(org.apache.arrow.memory.RootAllocator) ArrowTable(edu.iu.dsc.tws.common.table.arrow.ArrowTable)

Example 4 with Schema

use of org.apache.arrow.vector.types.pojo.Schema in project twister2 by DSC-SPIDAL.

the class ArrowTSetSourceExample method execute.

/**
 * Reads a CSV source, parses the first column of every row as an integer and sinks the
 * values through an {@link ArrowBasedSinkFunction}; then reads them back through an
 * Arrow source and logs the size and contents of each collected list.
 */
@Override
public void execute(WorkerEnvironment workerEnv) {
    BatchEnvironment batchEnv = TSetEnvironment.initBatch(workerEnv);
    Config cfg = batchEnv.getConfig();

    String csvDir = cfg.getStringValue(DataObjectConstants.DINPUT_DIRECTORY);
    String arrowDir = cfg.getStringValue(DataObjectConstants.ARROW_DIRECTORY);
    String arrowFile = cfg.getStringValue(DataObjectConstants.FILE_NAME);
    int workerCount = cfg.getIntegerValue(DataObjectConstants.WORKERS);
    int parallelism = cfg.getIntegerValue(DataObjectConstants.PARALLELISM_VALUE);
    int dataSize = cfg.getIntegerValue(DataObjectConstants.DSIZE);
    LOG.info("arrow input file:" + arrowFile + "\t" + arrowDir + "\t" + csvDir + "\t" + workerCount + "\t" + parallelism);

    Schema schema = makeSchema();

    // Sink side: CSV rows -> first column as Integer -> Arrow sink.
    SourceTSet<String[]> csvSource = batchEnv.createCSVSource(csvDir, dataSize, parallelism, "split");
    MapFunc<String[], Integer> firstColumnAsInt = row -> Integer.parseInt(row[0]);
    SinkTSet<Iterator<Integer>> arrowSink = csvSource
        .direct()
        .map(firstColumnAsInt)
        .direct()
        .sink(new ArrowBasedSinkFunction<>(arrowDir, arrowFile, schema.toJson()));
    batchEnv.run(arrowSink);

    // Source side: drain each iterator into a list of Integers.
    ComputeFunc<Iterator<Object>, List<Integer>> collectIntegers = values -> {
        List<Integer> collected = new ArrayList<>();
        while (values.hasNext()) {
            collected.add((Integer) values.next());
        }
        return collected;
    };
    batchEnv.createArrowSource(arrowDir, arrowFile, parallelism, schema.toJson())
        .direct()
        .compute(collectIntegers)
        .direct()
        .forEach(s -> LOG.info("Integer Array Size:" + s.size() + "\tvalues:" + s));
}
Also used : Twister2Job(edu.iu.dsc.tws.api.Twister2Job) ArrowBasedSinkFunction(edu.iu.dsc.tws.tset.fn.impl.ArrowBasedSinkFunction) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) ResourceAllocator(edu.iu.dsc.tws.rsched.core.ResourceAllocator) Options(org.apache.commons.cli.Options) BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) HashMap(java.util.HashMap) Config(edu.iu.dsc.tws.api.config.Config) MapFunc(edu.iu.dsc.tws.api.tset.fn.MapFunc) JobConfig(edu.iu.dsc.tws.api.JobConfig) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) DefaultParser(org.apache.commons.cli.DefaultParser) ImmutableList(com.google.common.collect.ImmutableList) CommandLine(org.apache.commons.cli.CommandLine) Iterator(java.util.Iterator) SourceTSet(edu.iu.dsc.tws.tset.sets.batch.SourceTSet) CommandLineParser(org.apache.commons.cli.CommandLineParser) FieldType(org.apache.arrow.vector.types.pojo.FieldType) SinkTSet(edu.iu.dsc.tws.tset.sets.batch.SinkTSet) Field(org.apache.arrow.vector.types.pojo.Field) Logger(java.util.logging.Logger) Utils(edu.iu.dsc.tws.examples.Utils) DataObjectConstants(edu.iu.dsc.tws.data.utils.DataObjectConstants) Serializable(java.io.Serializable) Twister2Submitter(edu.iu.dsc.tws.rsched.job.Twister2Submitter) List(java.util.List) WorkerEnvironment(edu.iu.dsc.tws.api.resource.WorkerEnvironment) TSetEnvironment(edu.iu.dsc.tws.tset.env.TSetEnvironment) ComputeFunc(edu.iu.dsc.tws.api.tset.fn.ComputeFunc) Twister2Worker(edu.iu.dsc.tws.api.resource.Twister2Worker) BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) Config(edu.iu.dsc.tws.api.config.Config) JobConfig(edu.iu.dsc.tws.api.JobConfig) Schema(org.apache.arrow.vector.types.pojo.Schema) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List)

Example 5 with Schema

use of org.apache.arrow.vector.types.pojo.Schema in project twister2 by DSC-SPIDAL.

the class ArrowTSetSourceExample method makeSchema.

/**
 * Builds the Arrow schema used by this example: a single nullable field named
 * {@code "int"} of type {@code ArrowType.Int(32, true)}, with {@code null} passed
 * for the field's children and for the schema's second constructor argument.
 *
 * @return the one-field schema
 */
private Schema makeSchema() {
    Field intField = new Field("int", FieldType.nullable(new ArrowType.Int(32, true)), null);
    // A second field was previously sketched here but is left disabled:
    // new Field("long", FieldType.nullable(new ArrowType.Int(64, true)), null)
    ImmutableList<Field> schemaFields = ImmutableList.of(intField);
    return new Schema(schemaFields, null);
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) ImmutableList(com.google.common.collect.ImmutableList) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType)

Aggregations

Schema (org.apache.arrow.vector.types.pojo.Schema)8 Field (org.apache.arrow.vector.types.pojo.Field)5 ArrowType (org.apache.arrow.vector.types.pojo.ArrowType)3 ImmutableList (com.google.common.collect.ImmutableList)2 RootAllocator (org.apache.arrow.memory.RootAllocator)2 Test (org.junit.Test)2 ArrowSchema (com.google.cloud.bigquery.storage.v1.ArrowSchema)1 JobConfig (edu.iu.dsc.tws.api.JobConfig)1 Twister2Job (edu.iu.dsc.tws.api.Twister2Job)1 Config (edu.iu.dsc.tws.api.config.Config)1 Twister2Worker (edu.iu.dsc.tws.api.resource.Twister2Worker)1 WorkerEnvironment (edu.iu.dsc.tws.api.resource.WorkerEnvironment)1 ComputeFunc (edu.iu.dsc.tws.api.tset.fn.ComputeFunc)1 MapFunc (edu.iu.dsc.tws.api.tset.fn.MapFunc)1 Table (edu.iu.dsc.tws.common.table.Table)1 ArrowTable (edu.iu.dsc.tws.common.table.arrow.ArrowTable)1 ArrowAllToAll (edu.iu.dsc.tws.comms.table.ArrowAllToAll)1 LogicalPlanBuilder (edu.iu.dsc.tws.comms.utils.LogicalPlanBuilder)1 DataObjectConstants (edu.iu.dsc.tws.data.utils.DataObjectConstants)1 Utils (edu.iu.dsc.tws.examples.Utils)1