
Example 6 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class ExampleRecordHandler method readWithConstraint.

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest Details of the read request, including:
 * 1. The Split
 * 2. The Catalog, Database, and Table the read request is for.
 * 3. The filtering predicate (if any)
 * 4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws IOException
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
 * ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: enter - " + recordsRequest.getSplit());
    Split split = recordsRequest.getSplit();
    int splitYear = 0;
    int splitMonth = 0;
    int splitDay = 0;
    /**
     * TODO: Extract information about what we need to read from the split. If you are following the tutorial
     *  this is basically the partition column values for year, month, day.
     *
     *         splitYear = split.getPropertyAsInt("year");
     *         splitMonth = split.getPropertyAsInt("month");
     *         splitDay = split.getPropertyAsInt("day");
     */
    String dataBucket = null;
    /**
     * TODO: Get the data bucket from the env variable set by athena-example.yaml
     *
     *         dataBucket = System.getenv("data_bucket");
     */
    String dataKey = format("%s/%s/%s/sample_data.csv", splitYear, splitMonth, splitDay);
    BufferedReader s3Reader = openS3File(dataBucket, dataKey);
    if (s3Reader == null) {
        // There is no data to read for this split.
        return;
    }
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    /**
     * TODO: Add extractors for each field to our RowWriterBuilder. The RowWriterBuilder will then 'generate'
     * optimized code for converting our data to Apache Arrow, automatically minimizing memory overhead, code
     * branches, etc. These extractors are invoked later in the code when we call the RowWriter for each line in our S3 file.
     *
     *         builder.withExtractor("year", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[0]);
     *         });
     *
     *         builder.withExtractor("month", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[1]);
     *         });
     *
     *         builder.withExtractor("day", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[2]);
     *         });
     *
     *         builder.withExtractor("encrypted_payload", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *             value.isSet = 1;
     *             value.value = ((String[]) context)[6];
     *         });
     */
    /**
     * TODO: The account_id field is a sensitive field, so we'd like to mask it to the last 4 before
     *  returning it to Athena. Note that this will mean you can only filter (where/having)
     *  on the masked value from Athena.
     *
     *         builder.withExtractor("account_id", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *             value.isSet = 1;
     *             String accountId = ((String[]) context)[3];
     *             value.value = accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
     *         });
     */
    /**
     * TODO: Write data for our transaction STRUCT:
     * For complex types like List and Struct, we can build a Map to conveniently set nested values
     *
     *         builder.withFieldWriterFactory("transaction",
     *                (FieldVector vector, Extractor extractor, ConstraintProjector constraint) ->
     *                    (Object context, int rowNum) -> {
     *                         Map<String, Object> eventMap = new HashMap<>();
     *                         eventMap.put("id", Integer.parseInt(((String[])context)[4]));
     *                         eventMap.put("completed", Boolean.parseBoolean(((String[])context)[5]));
     *                         BlockUtils.setComplexValue(vector, rowNum, FieldResolver.DEFAULT, eventMap);
     *                         return true;    //we don't yet support predicate pushdown on complex types
     *         });
     */
    // Used some basic code-gen to optimize how we generate response data.
    GeneratedRowWriter rowWriter = builder.build();
    // We read the transaction data line by line from our S3 object.
    String line;
    while ((line = s3Reader.readLine()) != null) {
        logger.info("readWithConstraint: processing line " + line);
        // The sample_data.csv file is structured as year,month,day,account_id,transaction.id,transaction.completed,encrypted_payload
        String[] lineParts = line.split(",");
        // We use the provided BlockSpiller to write our row data into the response. This utility is provided by
        // the Amazon Athena Query Federation SDK and automatically handles breaking the data into reasonably sized
        // chunks, encrypting it, and spilling to S3 if we've enabled these features.
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, lineParts) ? 1 : 0);
    }
}
Also used : GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BufferedReader(java.io.BufferedReader) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split)
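
The openS3File helper called above is not shown on this page. Below is a minimal sketch of what such a helper might look like, assuming the handler holds an AWS SDK for Java v1 AmazonS3 client in a field (the field name amazonS3 is illustrative, not taken from the project):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import com.amazonaws.services.s3.AmazonS3;

/**
 * Opens the requested S3 object for reading, or returns null when the object does not exist
 * (which the caller treats as "no data for this split").
 */
private BufferedReader openS3File(String bucket, String key) {
    logger.info("openS3File: opening " + bucket + "/" + key);
    // doesObjectExist avoids exception-driven control flow for missing objects.
    if (!amazonS3.doesObjectExist(bucket, key)) {
        return null;
    }
    return new BufferedReader(new InputStreamReader(amazonS3.getObject(bucket, key).getObjectContent()));
}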

Example 7 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class ExampleRecordHandler method readWithConstraint.

/**
 * Here we generate our simulated row data. A real connector would instead connect to the actual source and read
 * the data corresponding to the requested split.
 *
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles applying constraints, chunking the response, encrypting, and spilling to S3.
 * @param request The ReadRecordsRequest containing the split and other details about what to read.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) {
    long startTime = System.currentTimeMillis();
    /**
     * It is important to detect and throw any throttling exceptions before writing data, since Athena may not be
     * able to continue the query (due to consistency errors) if you throttle after data has been written.
     */
    if (simulateThrottle > 0 && count++ % simulateThrottle == 0) {
        logger.info("readWithConstraint: throwing throttle Exception!");
        throw new FederationThrottleException("Please slow down for this simulated throttling event");
    }
    logCaller(request);
    Set<String> partitionCols = new HashSet<>();
    String partitionColsMetadata = request.getSchema().getCustomMetadata().get("partitionCols");
    if (partitionColsMetadata != null) {
        partitionCols.addAll(Arrays.asList(partitionColsMetadata.split(",")));
    }
    int year = Integer.valueOf(request.getSplit().getProperty("year"));
    int month = Integer.valueOf(request.getSplit().getProperty("month"));
    int day = Integer.valueOf(request.getSplit().getProperty("day"));
    final RowContext rowContext = new RowContext(year, month, day);
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(request.getConstraints());
    for (Field next : request.getSchema().getFields()) {
        Extractor extractor = makeExtractor(next, rowContext);
        if (extractor != null) {
            builder.withExtractor(next.getName(), extractor);
        } else {
            builder.withFieldWriterFactory(next.getName(), makeFactory(next, rowContext));
        }
    }
    GeneratedRowWriter rowWriter = builder.build();
    for (int i = 0; i < numRowsPerSplit; i++) {
        rowContext.seed = i;
        rowContext.negative = i % 2 == 0;
        if (!queryStatusChecker.isQueryRunning()) {
            return;
        }
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, rowContext) ? 1 : 0);
    }
    logger.info("readWithConstraint: Completed generating rows in {} ms", System.currentTimeMillis() - startTime);
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) Block(com.amazonaws.athena.connector.lambda.data.Block) FederationThrottleException(com.amazonaws.athena.connector.lambda.exceptions.FederationThrottleException) BigIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.BigIntExtractor) DecimalExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DecimalExtractor) DateDayExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DateDayExtractor) TinyIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.TinyIntExtractor) VarBinaryExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.VarBinaryExtractor) BitExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.BitExtractor) IntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.IntExtractor) Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor) Float8Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Float8Extractor) SmallIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.SmallIntExtractor) VarCharExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor) Float4Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Float4Extractor) DateMilliExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DateMilliExtractor) HashSet(java.util.HashSet)
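
The makeExtractor and makeFactory helpers used above are not shown on this page. Below is a condensed, hedged sketch of the kind of logic makeExtractor applies: switch on the Arrow minor type, return an Extractor for simple types, and return null so the caller falls back to a FieldWriterFactory for complex types. The values produced from RowContext.seed here are illustrative only, not the project's simulated data:

import org.apache.arrow.vector.holders.NullableIntHolder;
import org.apache.arrow.vector.types.Types;
import org.apache.arrow.vector.types.pojo.Field;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.IntExtractor;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor;
import com.amazonaws.athena.connector.lambda.data.writers.holders.NullableVarCharHolder;

private Extractor makeExtractor(Field field, RowContext rowContext) {
    Types.MinorType fieldType = Types.getMinorTypeForArrowType(field.getType());
    switch (fieldType) {
        case INT:
            return (IntExtractor) (Object context, NullableIntHolder dst) -> {
                dst.isSet = 1;
                dst.value = (int) ((RowContext) context).seed;
            };
        case VARCHAR:
            return (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
                dst.isSet = 1;
                dst.value = "seed-" + ((RowContext) context).seed;
            };
        default:
            // Complex types (LIST, STRUCT, ...) are handled by makeFactory via withFieldWriterFactory.
            return null;
    }
}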

Example 8 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class UserDefinedFunctionHandler method processRows.

/**
 * Processes a group of rows. This method takes in a block of data (containing multiple rows), processes them, and
 * returns multiple rows of the output column in a block.
 * <p>
 * UDF methods are invoked row-by-row in a for loop. Arrow values are converted to Java Objects and then passed into
 * the UDF Java method. This is not very efficient because we might potentially be doing a lot of data copying.
 * Advanced users could choose to override this method and directly deal with Arrow data to achieve better
 * performance.
 *
 * @param allocator arrow memory allocator
 * @param udfMethod the extracted Java method matching the User-Defined Function defined in Athena.
 * @param inputRecords input data in Arrow format
 * @param outputSchema output data schema in Arrow format
 * @return output data in Arrow format
 */
protected Block processRows(BlockAllocator allocator, Method udfMethod, Block inputRecords, Schema outputSchema) throws Exception {
    int rowCount = inputRecords.getRowCount();
    List<ArrowValueProjector> valueProjectors = Lists.newArrayList();
    for (Field field : inputRecords.getFields()) {
        FieldReader fieldReader = inputRecords.getFieldReader(field.getName());
        ArrowValueProjector arrowValueProjector = ProjectorUtils.createArrowValueProjector(fieldReader);
        valueProjectors.add(arrowValueProjector);
    }
    Field outputField = outputSchema.getFields().get(0);
    GeneratedRowWriter outputRowWriter = createOutputRowWriter(outputField, valueProjectors, udfMethod);
    Block outputRecords = allocator.createBlock(outputSchema);
    outputRecords.setRowCount(rowCount);
    try {
        for (int rowNum = 0; rowNum < rowCount; ++rowNum) {
            outputRowWriter.writeRow(outputRecords, rowNum, rowNum);
        }
    } catch (Throwable t) {
        try {
            outputRecords.close();
        } catch (Exception e) {
            logger.error("Error closing output block", e);
        }
        throw t;
    }
    return outputRecords;
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) ArrowValueProjector(com.amazonaws.athena.connector.lambda.data.projectors.ArrowValueProjector) Block(com.amazonaws.athena.connector.lambda.data.Block) FieldReader(org.apache.arrow.vector.complex.reader.FieldReader) InvocationTargetException(java.lang.reflect.InvocationTargetException)
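
For context on where udfMethod comes from: UDFs are ordinary public methods on a handler that extends UserDefinedFunctionHandler, and processRows invokes the resolved Method once per input row. A minimal, hedged sketch of such a handler (the class name, source-type tag, and UDF body are illustrative):

import com.amazonaws.athena.connector.lambda.handlers.UserDefinedFunctionHandler;

public class ExampleUserDefinedFuncHandler extends UserDefinedFunctionHandler {
    // Illustrative source-type tag identifying this UDF package.
    private static final String SOURCE_TYPE = "custom";

    public ExampleUserDefinedFuncHandler() {
        super(SOURCE_TYPE);
    }

    // Example scalar UDF: invoked once per row by processRows via reflection.
    public String to_upper(String input) {
        return input == null ? null : input.toUpperCase();
    }
}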

Example 9 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class UserDefinedFunctionHandler method createOutputRowWriter.

private GeneratedRowWriter createOutputRowWriter(Field outputField, List<ArrowValueProjector> valueProjectors, Method udfMethod) {
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder();
    Extractor extractor = makeExtractor(outputField, valueProjectors, udfMethod);
    if (extractor != null) {
        builder.withExtractor(outputField.getName(), extractor);
    } else {
        builder.withFieldWriterFactory(outputField.getName(), makeFactory(outputField, valueProjectors, udfMethod));
    }
    return builder.build();
}
Also used : GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BigIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.BigIntExtractor) DecimalExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DecimalExtractor) DateDayExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DateDayExtractor) TinyIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.TinyIntExtractor) VarBinaryExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.VarBinaryExtractor) BitExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.BitExtractor) IntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.IntExtractor) Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor) Float8Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Float8Extractor) SmallIntExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.SmallIntExtractor) VarCharExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor) Float4Extractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.Float4Extractor) DateMilliExtractor(com.amazonaws.athena.connector.lambda.data.writers.extractors.DateMilliExtractor)
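
The makeExtractor used here (not shown on this page) builds an Extractor whose context is the input row number, since Example 8 calls writeRow(outputRecords, rowNum, rowNum). A hedged sketch of what such an extractor could look like for a VARCHAR output column, assuming ArrowValueProjector exposes a project(int rowNum) accessor (an assumption inferred from its pairing with ProjectorUtils.createArrowValueProjector above):

import java.lang.reflect.Method;
import java.util.List;
import com.amazonaws.athena.connector.lambda.data.projectors.ArrowValueProjector;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor;
import com.amazonaws.athena.connector.lambda.data.writers.holders.NullableVarCharHolder;

private Extractor makeVarCharUdfExtractor(List<ArrowValueProjector> valueProjectors, Method udfMethod) {
    return (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
        int rowNum = (Integer) context;                    // context is the input row number
        Object[] arguments = new Object[valueProjectors.size()];
        for (int col = 0; col < valueProjectors.size(); col++) {
            arguments[col] = valueProjectors.get(col).project(rowNum);   // assumed accessor
        }
        try {
            Object result = udfMethod.invoke(this, arguments);
            dst.isSet = result == null ? 0 : 1;
            dst.value = result == null ? null : result.toString();
        }
        catch (ReflectiveOperationException e) {
            throw new RuntimeException("UDF invocation failed", e);
        }
    };
}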

Example 10 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class PropertyGraphHandler method executeQuery.

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller            A BlockSpiller that should be used to write the row
 *                           data associated with this Split. The BlockSpiller
 *                           automatically handles chunking the response,
 *                           encrypting, and spilling to S3.
 * @param recordsRequest     Details of the read request, including: 1. The
 *                           Split 2. The Catalog, Database, and Table the read
 *                           request is for. 3. The filtering predicate (if any)
 *                           4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing
 *                           work for a query that has already terminated
 * @throws Exception
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because
 *       this will limit the BlockSpiller's ability to control Block size. The
 *       resulting increase in Block size may cause failures and reduced
 *       performance.
 */
public void executeQuery(ReadRecordsRequest recordsRequest, final QueryStatusChecker queryStatusChecker, final BlockSpiller spiller) throws Exception {
    logger.debug("readWithConstraint: enter - " + recordsRequest.getSplit());
    long numRows = 0;
    Client client = neptuneConnection.getNeptuneClientConnection();
    GraphTraversalSource graphTraversalSource = neptuneConnection.getTraversalSource(client);
    GraphTraversal graphTraversal = null;
    String labelName = recordsRequest.getTableName().getTableName();
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    String type = recordsRequest.getSchema().getCustomMetadata().get("componenttype");
    TableSchemaMetaType tableSchemaMetaType = TableSchemaMetaType.valueOf(type.toUpperCase());
    logger.debug("readWithConstraint: schema type is " + tableSchemaMetaType.toString());
    if (tableSchemaMetaType != null) {
        switch(tableSchemaMetaType) {
            case VERTEX:
                graphTraversal = graphTraversalSource.V().hasLabel(labelName);
                getQueryPartForContraintsMap(graphTraversal, recordsRequest);
                graphTraversal = graphTraversal.valueMap().with(WithOptions.tokens);
                for (final Field nextField : recordsRequest.getSchema().getFields()) {
                    VertexRowWriter.writeRowTemplate(builder, nextField);
                }
                break;
            case EDGE:
                graphTraversal = graphTraversalSource.E().hasLabel(labelName);
                getQueryPartForContraintsMap(graphTraversal, recordsRequest);
                graphTraversal = graphTraversal.elementMap();
                for (final Field nextField : recordsRequest.getSchema().getFields()) {
                    EdgeRowWriter.writeRowTemplate(builder, nextField);
                }
                break;
        }
    }
    // log string equivalent of gremlin query
    logger.debug("readWithConstraint: enter - " + GroovyTranslator.of("g").translate(graphTraversal.asAdmin().getBytecode()));
    final GraphTraversal graphTraversalFinal1 = graphTraversal;
    final GeneratedRowWriter rowWriter1 = builder.build();
    while (graphTraversalFinal1.hasNext() && queryStatusChecker.isQueryRunning()) {
        numRows++;
        spiller.writeRows((final Block block, final int rowNum) -> {
            final Map obj = (Map) graphTraversalFinal1.next();
            return (rowWriter1.writeRow(block, rowNum, (Object) obj) ? 1 : 0);
        });
    }
    logger.info("readWithConstraint: numRows[{}]", numRows);
}
Also used : GraphTraversalSource(org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource) TableSchemaMetaType(com.amazonaws.athena.connectors.neptune.propertygraph.Enums.TableSchemaMetaType) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) GraphTraversal(org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversal) Block(com.amazonaws.athena.connector.lambda.data.Block) Client(org.apache.tinkerpop.gremlin.driver.Client) Map(java.util.Map)
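
The per-field templates registered above (VertexRowWriter.writeRowTemplate and EdgeRowWriter.writeRowTemplate) are not shown on this page. Below is a simplified, hedged sketch of the kind of extractor such a template might register for a VARCHAR vertex column; the context object is the Map produced by valueMap().with(WithOptions.tokens), where property values typically arrive as single-element Lists (the real templates also handle T.id, T.label, and the other Arrow types):

import java.util.List;
import java.util.Map;
import org.apache.arrow.vector.types.pojo.Field;
import com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter;
import com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor;
import com.amazonaws.athena.connector.lambda.data.writers.holders.NullableVarCharHolder;

private static void writeVarCharRowTemplate(GeneratedRowWriter.RowWriterBuilder builder, Field field) {
    final String fieldName = field.getName();
    builder.withExtractor(fieldName, (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
        Map<?, ?> row = (Map<?, ?>) context;
        Object raw = row.get(fieldName);
        if (raw instanceof List && !((List<?>) raw).isEmpty()) {
            raw = ((List<?>) raw).get(0);   // valueMap() wraps property values in Lists
        }
        dst.isSet = raw == null ? 0 : 1;
        dst.value = raw == null ? null : raw.toString();
    });
}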

Aggregations

GeneratedRowWriter (com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) 10
Block (com.amazonaws.athena.connector.lambda.data.Block) 8
Field (org.apache.arrow.vector.types.pojo.Field) 6
BigIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.BigIntExtractor) 2
BitExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.BitExtractor) 2
DateDayExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DateDayExtractor) 2
DateMilliExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DateMilliExtractor) 2
DecimalExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DecimalExtractor) 2
Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor) 2
Float4Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Float4Extractor) 2
Float8Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Float8Extractor) 2
IntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.IntExtractor) 2
SmallIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.SmallIntExtractor) 2
TinyIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.TinyIntExtractor) 2
VarBinaryExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.VarBinaryExtractor) 2
VarCharExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor) 2
Split (com.amazonaws.athena.connector.lambda.domain.Split) 2
Row (com.amazonaws.services.timestreamquery.model.Row) 2
BufferedReader (java.io.BufferedReader) 2
IOException (java.io.IOException) 2