Use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.
In class JoinDefinitionTest, method testBadOutputSchema.
/**
 * Builds a join definition whose declared output schema disagrees with the selected fields and
 * checks that the build is rejected with one error per problem.
 */
@Test
public void testBadOutputSchema() {
  JoinStage purchases = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
  JoinStage users = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

  // Deliberate problems in the schema below:
  //   - the 'price' field is missing
  //   - there is an extra 'pricee' field
  //   - 'coupon' should be nullable
  //   - 'email' is the wrong type
  Schema badSchema = Schema.recordOf(
      "joined",
      Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
      Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
      Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
      Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
      Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
      Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
      Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
      Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

  try {
    JoinDefinition.builder()
        .select(
            new JoinField("purchases", "id", "purchase_id"),
            new JoinField("users", "id", "user_id"),
            new JoinField("purchases", "ts"),
            new JoinField("purchases", "price"),
            new JoinField("purchases", "coupon"),
            new JoinField("users", "name"),
            new JoinField("users", "email"),
            new JoinField("users", "age"),
            new JoinField("users", "bday"))
        .from(purchases, users)
        .on(JoinCondition.onKeys()
            .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
            .addKey(new JoinKey("users", Collections.singletonList("id")))
            .build())
        .setOutputSchema(badSchema)
        .build();
    Assert.fail("Invalid output schema did not fail as expected");
  } catch (InvalidJoinException e) {
    // Reaching this handler is the expected outcome; now inspect the reported errors.
    Collection<JoinError> errors = e.getErrors();
    Assert.assertEquals(4, errors.size());

    // Output field name -> expected type carried by the corresponding output schema error.
    Map<String, String> expected = new HashMap<>();
    expected.put("pricee", null);
    expected.put("coupon", "boolean");
    expected.put("email", "string");

    Map<String, String> reported = new HashMap<>();
    for (JoinError error : errors) {
      if (error.getType() == JoinError.Type.OUTPUT_SCHEMA) {
        OutputSchemaError schemaError = (OutputSchemaError) error;
        reported.put(schemaError.getField(), schemaError.getExpectedType());
      } else {
        // The remaining error is the one about a selected field missing from the output schema.
        Assert.assertEquals(JoinError.Type.GENERAL, error.getType());
      }
    }
    Assert.assertEquals(expected, reported);
  }
}
Use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.
In class JoinDistribution, method validate.
/**
 * Checks this distribution's settings against the stages being joined.
 *
 * <p>All checks are run and every failure is collected; validation does not stop at the first
 * problem.
 *
 * @param stages the stages participating in the join
 * @return the errors found, in the order the checks are performed; empty if there are none
 */
public Collection<JoinError> validate(List<JoinStage> stages) {
  List<JoinError> problems = new ArrayList<>();

  if (stages.size() > 2) {
    problems.add(new JoinError("Only two stages can be joined if a distribution factor is specified"));
  }

  if (skewedStageName == null) {
    problems.add(new DistributionStageError("Distribution requires skewed stage name to be defined"));
  }

  if (distributionFactor < 1) {
    problems.add(new DistributionSizeError("Distribution size must be greater than 0"));
  }

  // Find the first stage whose name equals the configured skewed stage name, if any.
  JoinStage skewedStage = null;
  for (JoinStage candidate : stages) {
    if (candidate.getStageName().equals(skewedStageName)) {
      skewedStage = candidate;
      break;
    }
  }

  if (skewedStage == null) {
    String notFound = String.format(
        "Skewed stage '%s' does not match any of the specified " + "stages",
        skewedStageName);
    problems.add(new DistributionStageError(notFound));
  } else if (!skewedStage.isRequired()) {
    String notRequired = String.format(
        "Distribution only supports inner or left outer joins, the skewed " + "stage '%s' must be required",
        skewedStageName);
    problems.add(new DistributionStageError(notRequired));
  }

  // A single broadcast stage is enough to rule out distribution.
  boolean hasBroadcastStage = false;
  for (JoinStage candidate : stages) {
    if (candidate.isBroadcast()) {
      hasBroadcastStage = true;
      break;
    }
  }
  if (hasBroadcastStage) {
    problems.add(new BroadcastError("Distribution cannot be used if either stage will be broadcast"));
  }

  return problems;
}
Use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by cdapio.
In class JoinDistribution, method validate.
/**
 * Checks this distribution's settings against the stages being joined.
 *
 * <p>All checks are run and every failure is collected; validation does not stop at the first
 * problem.
 *
 * @param stages the stages participating in the join
 * @return the errors found, in the order the checks are performed; empty if there are none
 */
public Collection<JoinError> validate(List<JoinStage> stages) {
  List<JoinError> problems = new ArrayList<>();

  if (stages.size() > 2) {
    problems.add(new JoinError("Only two stages can be joined if a distribution factor is specified"));
  }

  if (skewedStageName == null) {
    problems.add(new DistributionStageError("Distribution requires skewed stage name to be defined"));
  }

  if (distributionFactor < 1) {
    problems.add(new DistributionSizeError("Distribution size must be greater than 0"));
  }

  // Find the first stage whose name equals the configured skewed stage name, if any.
  JoinStage skewedStage = null;
  for (JoinStage candidate : stages) {
    if (candidate.getStageName().equals(skewedStageName)) {
      skewedStage = candidate;
      break;
    }
  }

  if (skewedStage == null) {
    String notFound = String.format(
        "Skewed stage '%s' does not match any of the specified " + "stages",
        skewedStageName);
    problems.add(new DistributionStageError(notFound));
  } else if (!skewedStage.isRequired()) {
    String notRequired = String.format(
        "Distribution only supports inner or left outer joins, the skewed " + "stage '%s' must be required",
        skewedStageName);
    problems.add(new DistributionStageError(notRequired));
  }

  // A single broadcast stage is enough to rule out distribution.
  boolean hasBroadcastStage = false;
  for (JoinStage candidate : stages) {
    if (candidate.isBroadcast()) {
      hasBroadcastStage = true;
      break;
    }
  }
  if (hasBroadcastStage) {
    problems.add(new BroadcastError("Distribution cannot be used if either stage will be broadcast"));
  }

  return problems;
}
Use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by caskdata.
In class InvalidJoinException, method getMessage.
/**
 * Builds the exception message from the first error in the collection.
 *
 * <p>The result is that error's message, terminated with a period if it does not already end
 * with one, followed by a space and the error's corrective action when one is present.
 *
 * @param errors the errors that made the join invalid
 * @return the formatted message for the first error
 * @throws IllegalStateException if {@code errors} is empty
 */
private static String getMessage(Collection<JoinError> errors) {
  if (errors.isEmpty()) {
    throw new IllegalStateException("An invalid join must contain at least one error, " + "or it must provide an error message.");
  }
  JoinError error = errors.iterator().next();
  String message = error.getMessage();
  String sentence = message.endsWith(".") ? message : message + ".";

  // Errors can be created from a message alone. Formatting a null corrective action with %s
  // would append the literal text "null", so it is only appended when there is something to say.
  String correctiveAction = error.getCorrectiveAction();
  if (correctiveAction == null || correctiveAction.isEmpty()) {
    return sentence;
  }
  return sentence + " " + correctiveAction;
}
Use of io.cdap.cdap.etl.api.join.error.JoinError in project cdap by cdapio.
In class JoinDefinitionTest, method testBadOutputSchema.
/**
 * Builds a join definition whose declared output schema disagrees with the selected fields and
 * checks that the build is rejected with one error per problem.
 */
@Test
public void testBadOutputSchema() {
  JoinStage purchases = JoinStage.builder("purchases", PURCHASE_SCHEMA).build();
  JoinStage users = JoinStage.builder("users", USER_SCHEMA).isOptional().build();

  // Deliberate problems in the schema below:
  //   - the 'price' field is missing
  //   - there is an extra 'pricee' field
  //   - 'coupon' should be nullable
  //   - 'email' is the wrong type
  Schema badSchema = Schema.recordOf(
      "joined",
      Schema.Field.of("ts", Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)),
      Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
      Schema.Field.of("user_id", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
      Schema.Field.of("pricee", Schema.of(Schema.Type.DOUBLE)),
      Schema.Field.of("coupon", Schema.of(Schema.Type.BOOLEAN)),
      Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.FLOAT))),
      Schema.Field.of("age", Schema.nullableOf(Schema.of(Schema.Type.INT))),
      Schema.Field.of("bday", Schema.nullableOf(Schema.of(Schema.LogicalType.DATE))));

  try {
    JoinDefinition.builder()
        .select(
            new JoinField("purchases", "id", "purchase_id"),
            new JoinField("users", "id", "user_id"),
            new JoinField("purchases", "ts"),
            new JoinField("purchases", "price"),
            new JoinField("purchases", "coupon"),
            new JoinField("users", "name"),
            new JoinField("users", "email"),
            new JoinField("users", "age"),
            new JoinField("users", "bday"))
        .from(purchases, users)
        .on(JoinCondition.onKeys()
            .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
            .addKey(new JoinKey("users", Collections.singletonList("id")))
            .build())
        .setOutputSchema(badSchema)
        .build();
    Assert.fail("Invalid output schema did not fail as expected");
  } catch (InvalidJoinException e) {
    // Reaching this handler is the expected outcome; now inspect the reported errors.
    Collection<JoinError> errors = e.getErrors();
    Assert.assertEquals(4, errors.size());

    // Output field name -> expected type carried by the corresponding output schema error.
    Map<String, String> expected = new HashMap<>();
    expected.put("pricee", null);
    expected.put("coupon", "boolean");
    expected.put("email", "string");

    Map<String, String> reported = new HashMap<>();
    for (JoinError error : errors) {
      if (error.getType() == JoinError.Type.OUTPUT_SCHEMA) {
        OutputSchemaError schemaError = (OutputSchemaError) error;
        reported.put(schemaError.getField(), schemaError.getExpectedType());
      } else {
        // The remaining error is the one about a selected field missing from the output schema.
        Assert.assertEquals(JoinError.Type.GENERAL, error.getType());
      }
    }
    Assert.assertEquals(expected, reported);
  }
}
Aggregations