use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.
the class ExampleRecordHandler method readWithConstraint.
/**
* Used to read the row data associated with the provided Split.
*
* @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
* The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
* @param recordsRequest Details of the read request, including:
* 1. The Split
* 2. The Catalog, Database, and Table the read request is for.
* 3. The filtering predicate (if any)
* 4. The columns required for projection.
* @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
* @throws IOException
* @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
* ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
*/
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: enter - " + recordsRequest.getSplit());

    Split split = recordsRequest.getSplit();
    int splitYear = 0;
    int splitMonth = 0;
    int splitDay = 0;

    /**
     * TODO: Extract information about what we need to read from the split. If you are following the tutorial
     * this is basically the partition column values for year, month, day.
     *
     * splitYear = split.getPropertyAsInt("year");
     * splitMonth = split.getPropertyAsInt("month");
     * splitDay = split.getPropertyAsInt("day");
     */

    String dataBucket = null;
    /**
     * TODO: Get the data bucket from the env variable set by athena-example.yaml
     *
     * dataBucket = System.getenv("data_bucket");
     */

    String dataKey = format("%s/%s/%s/sample_data.csv", splitYear, splitMonth, splitDay);
    BufferedReader s3Reader = openS3File(dataBucket, dataKey);
    if (s3Reader == null) {
        // There is no data to read for this split.
        return;
    }

    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    /**
     * TODO: Add extractors for each field to our RowWriterBuilder. The RowWriterBuilder will then 'generate'
     * optimized code for converting our data to Apache Arrow, automatically minimizing memory overhead, code
     * branches, etc. Later in the code we call the resulting RowWriter for each line in our S3 file.
     *
     * builder.withExtractor("year", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[0]);
     * });
     *
     * builder.withExtractor("month", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[1]);
     * });
     *
     * builder.withExtractor("day", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[2]);
     * });
     *
     * builder.withExtractor("encrypted_payload", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *     value.isSet = 1;
     *     value.value = ((String[]) context)[6];
     * });
     */
    /**
     * TODO: The account_id field is a sensitive field, so we'd like to mask it to the last 4 characters before
     * returning it to Athena. Note that this will mean you can only filter (where/having)
     * on the masked value from Athena.
     *
     * builder.withExtractor("account_id", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *     value.isSet = 1;
     *     String accountId = ((String[]) context)[3];
     *     value.value = accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
     * });
     */

    /**
     * TODO: Write data for our transaction STRUCT.
     * For complex types like List and Struct, we can build a Map to conveniently set nested values.
     *
     * builder.withFieldWriterFactory("transaction",
     *     (FieldVector vector, Extractor extractor, ConstraintProjector constraint) ->
     *         (Object context, int rowNum) -> {
     *             Map<String, Object> eventMap = new HashMap<>();
     *             eventMap.put("id", Integer.parseInt(((String[]) context)[4]));
     *             eventMap.put("completed", Boolean.parseBoolean(((String[]) context)[5]));
     *             BlockUtils.setComplexValue(vector, rowNum, FieldResolver.DEFAULT, eventMap);
     *             return true;    //we don't yet support predicate pushdown on complex types
     *         });
     */

    // Used some basic code-gen to optimize how we generate response data.
    GeneratedRowWriter rowWriter = builder.build();

    // We read the transaction data line by line from our S3 object.
    String line;
    while ((line = s3Reader.readLine()) != null) {
        logger.info("readWithConstraint: processing line " + line);

        // The sample_data.csv file is structured as year,month,day,account_id,transaction.id,transaction.completed,encrypted_payload
        String[] lineParts = line.split(",");

        // We use the provided BlockSpiller to write our row data into the response. This utility is provided by
        // the Amazon Athena Query Federation SDK and automatically handles breaking the data into reasonably sized
        // chunks, encrypting it, and spilling to S3 if we've enabled these features.
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, lineParts) ? 1 : 0);
    }
}
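
The openS3File(...) helper used above is part of the same ExampleRecordHandler but is not reproduced in this snippet. The sketch below only illustrates what such a helper could look like using the AWS SDK for Java v1 (com.amazonaws.services.s3.AmazonS3, java.io.BufferedReader, java.io.InputStreamReader); the amazonS3 field name and the return-null-when-the-object-is-missing behavior are assumptions inferred from the null check in readWithConstraint.

private BufferedReader openS3File(String bucket, String key) {
    logger.info("openS3File: opening file " + bucket + ":" + key);
    // Sketch: assumes an AmazonS3 client field named amazonS3 on the handler.
    if (amazonS3.doesObjectExist(bucket, key)) {
        return new BufferedReader(new InputStreamReader(amazonS3.getObject(bucket, key).getObjectContent()));
    }
    // Returning null signals to readWithConstraint that there is no data for this split.
    return null;
}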
use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.
the class ExampleRecordHandler method readWithConstraint.
/**
* Here we generate our simulated row data. A real connector would instead connect to the actual source and read
* the data corresponding to the requested split.
*
* @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
* The BlockSpiller automatically handles applying constraints, chunking the response, encrypting, and spilling to S3.
* @param request The ReadRecordsRequest containing the split and other details about what to read.
* @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
*/
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) {
    long startTime = System.currentTimeMillis();

    /**
     * It is important to throw any throttling events before writing data, since Athena may not be able to
     * continue the query (due to consistency errors) if you throttle after data has been written.
     */
    if (simulateThrottle > 0 && count++ % simulateThrottle == 0) {
        logger.info("readWithConstraint: throwing throttle Exception!");
        throw new FederationThrottleException("Please slow down for this simulated throttling event");
    }
    logCaller(request);

    Set<String> partitionCols = new HashSet<>();
    String partitionColsMetadata = request.getSchema().getCustomMetadata().get("partitionCols");
    if (partitionColsMetadata != null) {
        partitionCols.addAll(Arrays.asList(partitionColsMetadata.split(",")));
    }

    int year = Integer.valueOf(request.getSplit().getProperty("year"));
    int month = Integer.valueOf(request.getSplit().getProperty("month"));
    int day = Integer.valueOf(request.getSplit().getProperty("day"));

    final RowContext rowContext = new RowContext(year, month, day);

    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(request.getConstraints());
    for (Field next : request.getSchema().getFields()) {
        Extractor extractor = makeExtractor(next, rowContext);
        if (extractor != null) {
            builder.withExtractor(next.getName(), extractor);
        } else {
            builder.withFieldWriterFactory(next.getName(), makeFactory(next, rowContext));
        }
    }

    GeneratedRowWriter rowWriter = builder.build();
    for (int i = 0; i < numRowsPerSplit; i++) {
        rowContext.seed = i;
        rowContext.negative = i % 2 == 0;
        if (!queryStatusChecker.isQueryRunning()) {
            return;
        }
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, rowContext) ? 1 : 0);
    }

    logger.info("readWithConstraint: Completed generating rows in {} ms", System.currentTimeMillis() - startTime);
}
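
The makeExtractor(...) and makeFactory(...) helpers invoked above are not reproduced in this snippet. Purely as a sketch (the handled types, the value logic, and the assumption that RowContext exposes an int seed and a boolean negative field are illustrative, not the project's actual implementation), an extractor factory keyed off the Arrow minor type could look like this:

private Extractor makeExtractorSketch(Field field, RowContext rowContext) {
    Types.MinorType fieldType = Types.getMinorTypeForArrowType(field.getType());
    switch (fieldType) {
        case INT:
            return (IntExtractor) (Object context, NullableIntHolder dst) -> {
                dst.isSet = 1;
                dst.value = ((RowContext) context).seed;
            };
        case BIGINT:
            return (BigIntExtractor) (Object context, NullableBigIntHolder dst) -> {
                RowContext row = (RowContext) context;
                dst.isSet = 1;
                dst.value = row.negative ? -1L * row.seed : row.seed;
            };
        case VARCHAR:
            return (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
                dst.isSet = 1;
                dst.value = "string-" + ((RowContext) context).seed;
            };
        default:
            // Returning null makes the caller fall back to withFieldWriterFactory(...) for complex types.
            return null;
    }
}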
use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.
the class UserDefinedFunctionHandler method processRows.
/**
* Processes a group of rows. This method takes in a block of data (containing multiple rows), processes it, and
* returns multiple rows of the output column in a block.
* <p>
* UDF methods are invoked row-by-row in a for loop. Arrow values are converted to Java Objects and then passed into
* the UDF Java method. This is not very efficient because we might potentially be doing a lot of data copying.
* Advanced users could choose to override this method and directly deal with Arrow data to achieve better
* performance.
*
* @param allocator arrow memory allocator
* @param udfMethod the extracted java method matching the User-Defined-Function defined in Athena.
* @param inputRecords input data in Arrow format
* @param outputSchema output data schema in Arrow format
* @return output data in Arrow format
*/
protected Block processRows(BlockAllocator allocator, Method udfMethod, Block inputRecords, Schema outputSchema) throws Exception {
    int rowCount = inputRecords.getRowCount();

    List<ArrowValueProjector> valueProjectors = Lists.newArrayList();
    for (Field field : inputRecords.getFields()) {
        FieldReader fieldReader = inputRecords.getFieldReader(field.getName());
        ArrowValueProjector arrowValueProjector = ProjectorUtils.createArrowValueProjector(fieldReader);
        valueProjectors.add(arrowValueProjector);
    }

    Field outputField = outputSchema.getFields().get(0);
    GeneratedRowWriter outputRowWriter = createOutputRowWriter(outputField, valueProjectors, udfMethod);

    Block outputRecords = allocator.createBlock(outputSchema);
    outputRecords.setRowCount(rowCount);
    try {
        for (int rowNum = 0; rowNum < rowCount; ++rowNum) {
            outputRowWriter.writeRow(outputRecords, rowNum, rowNum);
        }
    } catch (Throwable t) {
        try {
            outputRecords.close();
        } catch (Exception e) {
            logger.error("Error closing output block", e);
        }
        throw t;
    }

    return outputRecords;
}
use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.
the class UserDefinedFunctionHandler method createOutputRowWriter.
private GeneratedRowWriter createOutputRowWriter(Field outputField, List<ArrowValueProjector> valueProjectors, Method udfMethod) {
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder();
    Extractor extractor = makeExtractor(outputField, valueProjectors, udfMethod);
    if (extractor != null) {
        builder.withExtractor(outputField.getName(), extractor);
    } else {
        builder.withFieldWriterFactory(outputField.getName(), makeFactory(outputField, valueProjectors, udfMethod));
    }
    return builder.build();
}
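
The makeExtractor(...) and makeFactory(...) helpers called above bridge the projected input rows and the UDF's return value, but they are not shown in this snippet. The following is only a sketch for a single output type (INT): it assumes ArrowValueProjector exposes a project(rowNum) accessor and that the UDF method can be invoked reflectively on the handler instance, mirroring how processRows passes the row number through as the per-row write context; none of this is the SDK's actual implementation.

private Extractor makeExtractorSketch(Field outputField, List<ArrowValueProjector> valueProjectors, Method udfMethod) {
    Types.MinorType outputType = Types.getMinorTypeForArrowType(outputField.getType());
    if (outputType != Types.MinorType.INT) {
        // Other output types omitted in this sketch; returning null falls back to a FieldWriterFactory.
        return null;
    }
    return (IntExtractor) (Object context, NullableIntHolder dst) -> {
        // processRows passes the row number as the per-row context (writeRow(outputRecords, rowNum, rowNum)).
        int rowNum = (Integer) context;
        Object[] arguments = new Object[valueProjectors.size()];
        for (int col = 0; col < valueProjectors.size(); col++) {
            arguments[col] = valueProjectors.get(col).project(rowNum);
        }
        try {
            Object result = udfMethod.invoke(this, arguments);
            if (result == null) {
                dst.isSet = 0;
            } else {
                dst.isSet = 1;
                dst.value = (Integer) result;
            }
        } catch (ReflectiveOperationException e) {
            throw new RuntimeException("UDF invocation failed", e);
        }
    };
}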
use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.
the class PropertyGraphHandler method executeQuery.
/**
* Used to read the row data associated with the provided Split.
*
* @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
* The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
* @param recordsRequest Details of the read request, including:
* 1. The Split
* 2. The Catalog, Database, and Table the read request is for.
* 3. The filtering predicate (if any)
* 4. The columns required for projection.
* @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
* @throws Exception
* @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
* ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
*/
public void executeQuery(ReadRecordsRequest recordsRequest, final QueryStatusChecker queryStatusChecker, final BlockSpiller spiller) throws Exception {
    logger.debug("readWithConstraint: enter - " + recordsRequest.getSplit());

    long numRows = 0;
    Client client = neptuneConnection.getNeptuneClientConnection();
    GraphTraversalSource graphTraversalSource = neptuneConnection.getTraversalSource(client);
    GraphTraversal graphTraversal = null;
    String labelName = recordsRequest.getTableName().getTableName();
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());

    String type = recordsRequest.getSchema().getCustomMetadata().get("componenttype");
    TableSchemaMetaType tableSchemaMetaType = TableSchemaMetaType.valueOf(type.toUpperCase());
    logger.debug("readWithConstraint: schema type is " + tableSchemaMetaType.toString());

    if (tableSchemaMetaType != null) {
        switch (tableSchemaMetaType) {
            case VERTEX:
                graphTraversal = graphTraversalSource.V().hasLabel(labelName);
                getQueryPartForContraintsMap(graphTraversal, recordsRequest);
                graphTraversal = graphTraversal.valueMap().with(WithOptions.tokens);
                for (final Field nextField : recordsRequest.getSchema().getFields()) {
                    VertexRowWriter.writeRowTemplate(builder, nextField);
                }
                break;
            case EDGE:
                graphTraversal = graphTraversalSource.E().hasLabel(labelName);
                getQueryPartForContraintsMap(graphTraversal, recordsRequest);
                graphTraversal = graphTraversal.elementMap();
                for (final Field nextField : recordsRequest.getSchema().getFields()) {
                    EdgeRowWriter.writeRowTemplate(builder, nextField);
                }
                break;
        }
    }

    // log string equivalent of gremlin query
    logger.debug("readWithConstraint: enter - " + GroovyTranslator.of("g").translate(graphTraversal.asAdmin().getBytecode()));

    final GraphTraversal graphTraversalFinal1 = graphTraversal;
    final GeneratedRowWriter rowWriter1 = builder.build();

    while (graphTraversalFinal1.hasNext() && queryStatusChecker.isQueryRunning()) {
        numRows++;
        spiller.writeRows((final Block block, final int rowNum) -> {
            final Map obj = (Map) graphTraversalFinal1.next();
            return (rowWriter1.writeRow(block, rowNum, (Object) obj) ? 1 : 0);
        });
    }

    logger.info("readWithConstraint: numRows[{}]", numRows);
}
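
The VertexRowWriter and EdgeRowWriter classes referenced above register one extractor (or field-writer factory) per field on the shared RowWriterBuilder. As a hedged sketch only (not the connector's actual implementation), a VARCHAR variant of such a writeRowTemplate could unwrap the single-element lists that Gremlin's valueMap() produces and hand the value to the builder like this:

public static void writeRowTemplateSketch(GeneratedRowWriter.RowWriterBuilder builder, Field field) {
    String fieldName = field.getName();
    builder.withExtractor(fieldName, (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
        // The traversal above yields a Map per element; valueMap() wraps each property value in a List.
        Map<?, ?> element = (Map<?, ?>) context;
        Object property = element.get(fieldName);
        if (property instanceof List) {
            List<?> values = (List<?>) property;
            property = values.isEmpty() ? null : values.get(0);
        }
        value.isSet = property == null ? 0 : 1;
        value.value = property == null ? null : property.toString();
    });
}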