use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.
the class RestApiExtractor method extractMetadata.
/**
 * Fetches the column metadata of {@code entity} through the REST connector, derives the output
 * schema from it and builds the data query.
 *
 * <p>For every column returned by the source the watermark flag, nullability and primary-key index
 * are set from the {@code extract.delta.fields} and {@code extract.primary.key.fields} properties.
 * A column is kept when no input query is configured, when the query selects {@code *}, or when the
 * column is named in the query's column list. Excluded columns are only honoured when no input
 * query is configured.
 *
 * @param schema source schema name, passed to {@code getSchemaMetadata}
 * @param entity source entity name, passed to {@code getSchemaMetadata} and {@code buildDataQuery}
 * @param workUnit not read by this method; configuration comes from {@code workUnitState}
 * @throws SchemaException if the connection fails or any step of the schema retrieval throws
 */
@Override
public void extractMetadata(String schema, String entity, WorkUnit workUnit) throws SchemaException {
  log.info("Extract Metadata using Rest Api");
  JsonArray columnArray = new JsonArray();

  String inputQuery = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_QUERY);
  List<String> columnListInQuery = null;
  if (!Strings.isNullOrEmpty(inputQuery)) {
    columnListInQuery = Utils.getColumnListFromQuery(inputQuery);
  }

  String excludedColumns = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXCLUDED_COLUMNS);
  List<String> columnListExcluded = ImmutableList.<String>of();
  if (Strings.isNullOrEmpty(inputQuery) && !Strings.isNullOrEmpty(excludedColumns)) {
    Splitter splitter = Splitter.on(",").omitEmptyStrings().trimResults();
    columnListExcluded = splitter.splitToList(excludedColumns.toLowerCase());
  }

  // These values do not change while iterating over the columns, so resolve them once.
  String deltaFields = workUnitState.getProp("extract.delta.fields");
  String primaryKeyFields = workUnitState.getProp("extract.primary.key.fields");
  // If the input query is absent or selects '*', every source column is a candidate.
  boolean selectAllColumns = inputQuery == null || columnListInQuery == null
      || (columnListInQuery.size() == 1 && "*".equals(columnListInQuery.get(0)));

  try {
    boolean success = this.connector.connect();
    if (!success) {
      throw new SchemaException("Failed to connect.");
    }
    log.debug("Connected successfully.");

    List<Command> cmds = this.getSchemaMetadata(schema, entity);
    CommandOutput<?, ?> response = this.connector.getResponse(cmds);
    JsonArray array = this.getSchema(response);

    for (JsonElement columnElement : array) {
      Schema obj = GSON.fromJson(columnElement, Schema.class);
      String columnName = obj.getColumnName();
      boolean watermarkColumn = this.isWatermarkColumn(deltaFields, columnName);
      int primaryKeyIndex = this.getPrimarykeyIndex(primaryKeyFields, columnName);

      obj.setWaterMark(watermarkColumn);
      if (watermarkColumn) {
        obj.setNullable(false);
      } else if (primaryKeyIndex == 0) {
        // Columns that are neither watermark nor primary key are always nullable;
        // primary-key columns keep the nullability reported by the source.
        obj.setNullable(true);
      }
      obj.setPrimaryKey(primaryKeyIndex);

      String jsonStr = GSON.toJson(obj);
      JsonObject jsonObject = GSON.fromJson(jsonStr, JsonObject.class).getAsJsonObject();

      // Otherwise, consider only the columns mentioned in the query's column list.
      boolean selected = selectAllColumns
          || (!columnListInQuery.isEmpty() && this.isMetadataColumn(columnName, columnListInQuery));
      if (selected && !columnListExcluded.contains(columnName.trim().toLowerCase())) {
        this.columnList.add(columnName);
        columnArray.add(jsonObject);
      }
    }

    this.updatedQuery = buildDataQuery(inputQuery, entity);
    log.info("Schema:" + columnArray);
    this.setOutputSchema(columnArray);
  } catch (RuntimeException | RestApiConnectionException | RestApiProcessingException | IOException | SchemaException e) {
    throw new SchemaException("Failed to get schema using rest api; error - " + e.getMessage(), e);
  }
}
use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.
the class JdbcExtractor method getSchema.
/**
 * Turns the schema-metadata result set carried by {@code response} into a JSON array with one
 * object per column.
 *
 * <p>Only the first result of the response is used. Each row is read positionally: column name (1),
 * data type (2), length (3), precision (4), scale (5), nullable (6), format (7) and comment (8).
 * The default value is always {@code null} and the unique flag is always {@code false}.
 *
 * @param response command output whose first result is iterated as a {@link ResultSet}
 * @return the per-column schema objects, in result-set order
 * @throws SchemaException if the response holds no result, or if reading or converting a row fails
 */
@Override
public JsonArray getSchema(CommandOutput<?, ?> response) throws SchemaException, IOException {
  this.log.debug("Extract schema from resultset");

  Iterator<ResultSet> results = (Iterator<ResultSet>) response.getResults().values().iterator();
  if (!results.hasNext()) {
    throw new SchemaException("Failed to get schema from database - Resultset has no records");
  }
  ResultSet metadataRows = results.next();

  JsonArray columns = new JsonArray();
  // No map symbols are supplied for the data type conversion.
  List<String> noMapSymbols = null;
  try {
    while (metadataRows.next()) {
      Schema column = new Schema();

      String name = metadataRows.getString(1);
      column.setColumnName(name);

      String sourceType = metadataRows.getString(2);
      column.setDataType(this.convertDataType(name, sourceType, "string", noMapSymbols));

      column.setLength(metadataRows.getLong(3));
      column.setPrecision(metadataRows.getInt(4));
      column.setScale(metadataRows.getInt(5));
      column.setNullable(metadataRows.getBoolean(6));
      column.setFormat(metadataRows.getString(7));
      column.setComment(metadataRows.getString(8));
      column.setDefaultValue(null);
      column.setUnique(false);

      // Round-trip through JSON text to obtain the column as a JsonObject.
      String serialized = gson.toJson(column);
      columns.add(gson.fromJson(serialized, JsonObject.class).getAsJsonObject());
    }
  } catch (Exception e) {
    throw new SchemaException("Failed to get schema from database; error - " + e.getMessage(), e);
  }
  return columns;
}
use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.
the class JdbcExtractor method extractMetadata.
/**
 * Resolves the source metadata over JDBC and derives from it the output schema, the header
 * record, the output column projection and the extract SQL.
 *
 * @param schema source schema name, passed to {@code getSchemaMetadata} and {@code getExtractQuery}
 * @param entity source entity name, passed to {@code getSchemaMetadata} and {@code getExtractQuery}
 * @param workUnit not read by this method; configuration comes from {@code workUnitState}
 * @throws SchemaException if any step inside the try block throws a RuntimeException, IOException
 *         or SchemaException
 * @throws IOException declared, but an IOException raised inside the try block is rethrown as
 *         SchemaException
 * @throws RuntimeException if {@code hasJoinOperation} reports true for the input query; this
 *         check runs before the try block, so it is not wrapped
 */
@Override
public void extractMetadata(String schema, String entity, WorkUnit workUnit) throws SchemaException, IOException {
this.log.info("Extract metadata using JDBC");
String inputQuery = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_QUERY);
// Reject the query up front when hasJoinOperation flags it.
if (hasJoinOperation(inputQuery)) {
throw new RuntimeException("Query across multiple tables not supported");
}
String watermarkColumn = workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
this.enableDelimitedIdentifier = workUnitState.getPropAsBoolean(ConfigurationKeys.ENABLE_DELIMITED_IDENTIFIER, ConfigurationKeys.DEFAULT_ENABLE_DELIMITED_IDENTIFIER);
// The default watermark's "columnName" is the name used for the derived watermark column below.
JsonObject defaultWatermark = this.getDefaultWatermark();
String derivedWatermarkColumnName = defaultWatermark.get("columnName").getAsString();
// Record the sample count taken from the query, then continue with the query returned by
// removeSampleClauseFromQuery (presumably the query without its sample clause - verify in helper).
this.setSampleRecordCount(this.exractSampleRecordCountFromQuery(inputQuery));
inputQuery = this.removeSampleClauseFromQuery(inputQuery);
JsonArray targetSchema = new JsonArray();
List<String> headerColumns = new ArrayList<>();
try {
List<Command> cmds = this.getSchemaMetadata(schema, entity);
CommandOutput<?, ?> response = this.executePreparedSql(cmds);
JsonArray array = this.getSchema(response);
this.buildMetadataColumnMap(array);
// NOTE(review): columnAliasMap is iterated right after this call; presumably parseInputQuery
// populates it - confirm before reordering anything here.
this.parseInputQuery(inputQuery);
List<String> sourceColumns = this.getMetadataColumnList();
for (ColumnAttributes colMap : this.columnAliasMap) {
String alias = colMap.getAliasName();
String columnName = colMap.getColumnName();
String sourceColumnName = colMap.getSourceColumnName();
// Only columns accepted by isMetadataColumn contribute to the schema, header and projection.
if (this.isMetadataColumn(columnName, sourceColumns)) {
String targetColumnName = this.getTargetColumnName(columnName, alias);
Schema obj = this.getUpdatedSchemaObject(columnName, alias, targetColumnName);
String jsonStr = gson.toJson(obj);
JsonObject jsonObject = gson.fromJson(jsonStr, JsonObject.class).getAsJsonObject();
targetSchema.add(jsonObject);
headerColumns.add(targetColumnName);
// The projection uses the source column name wrapped in the left/right identifier delimiters.
sourceColumnName = getLeftDelimitedIdentifier() + sourceColumnName + getRightDelimitedIdentifier();
this.columnList.add(sourceColumnName);
}
}
// With multiple watermark columns, add the delimited derived watermark column to the projection,
// header and schema, and overwrite the delta-fields property with that derived name.
if (this.hasMultipleWatermarkColumns(watermarkColumn)) {
derivedWatermarkColumnName = getLeftDelimitedIdentifier() + derivedWatermarkColumnName + getRightDelimitedIdentifier();
this.columnList.add(derivedWatermarkColumnName);
headerColumns.add(derivedWatermarkColumnName);
targetSchema.add(defaultWatermark);
this.workUnitState.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, derivedWatermarkColumnName);
}
// Null entries in columnList are rendered as the literal text "null".
String outputColProjection = Joiner.on(",").useForNull("null").join(this.columnList);
// NOTE(review): this replacement runs even when the multiple-watermark branch above was skipped,
// and String.replace matches substrings - confirm that is intended.
outputColProjection = outputColProjection.replace(derivedWatermarkColumnName, Utils.getCoalesceColumnNames(watermarkColumn) + " AS " + derivedWatermarkColumnName);
this.setOutputColumnProjection(outputColProjection);
String extractQuery = this.getExtractQuery(schema, entity, inputQuery);
this.setHeaderRecord(headerColumns);
this.setOutputSchema(targetSchema);
this.setExtractSql(extractQuery);
// this.workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY,
// this.escapeCharsInColumnName(this.workUnit.getProp(ConfigurationKeys.SOURCE_ENTITY),
// ConfigurationKeys.ESCAPE_CHARS_IN_COLUMN_NAME, "_"));
this.log.info("Schema:" + targetSchema);
this.log.info("Extract query: " + this.getExtractSql());
} catch (RuntimeException | IOException | SchemaException e) {
// Every failure inside the try block is reported to the caller as a SchemaException.
throw new SchemaException("Failed to get metadata using JDBC; error - " + e.getMessage(), e);
}
}
use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.
the class SalesforceExtractor method getSchema.
/**
 * Converts the JSON schema response from Salesforce into a JSON array with one object per field.
 *
 * <p>Only the first result of the response is used. It must be a JSON object with a
 * {@code fields} array; for each entry the keys {@code name}, {@code type}, {@code length},
 * {@code precision}, {@code scale}, {@code nillable}, {@code label}, {@code defaultValue} and
 * {@code unique} are read. {@code label} and {@code defaultValue} may be JSON null or absent, in
 * which case the comment or default value is {@code null}. The format is always {@code null}.
 *
 * @param response command output whose first result is the JSON text of the response
 * @return the per-field schema objects, in response order
 * @throws SchemaException if the response has no output, cannot be parsed as a JSON object, has
 *         no {@code fields} array, or a field cannot be converted
 */
@Override
public JsonArray getSchema(CommandOutput<?, ?> response) throws SchemaException {
  log.info("Get schema from salesforce");
  Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator();
  if (!itr.hasNext()) {
    throw new SchemaException("Failed to get schema from salesforce; REST response has no output");
  }
  String output = itr.next();

  JsonArray fieldJsonArray = new JsonArray();
  try {
    // Parse inside the try block so that malformed or empty output is reported as a
    // SchemaException instead of escaping as an unchecked parser error or NullPointerException.
    JsonObject jsonObject = GSON.fromJson(output, JsonObject.class);
    JsonArray array = (jsonObject == null) ? null : jsonObject.getAsJsonArray("fields");
    if (array == null) {
      throw new SchemaException("Failed to get schema from salesforce; REST response has no 'fields' array");
    }

    for (JsonElement columnElement : array) {
      JsonObject field = columnElement.getAsJsonObject();
      String columnName = field.get("name").getAsString();
      String dataType = field.get("type").getAsString();
      String elementDataType = "string";
      List<String> mapSymbols = null;
      JsonObject newDataType = this.convertDataType(columnName, dataType, elementDataType, mapSymbols);
      log.debug("ColumnName:" + columnName + "; old datatype:" + dataType + "; new datatype:" + newDataType);

      Schema schema = new Schema();
      schema.setColumnName(columnName);
      schema.setDataType(newDataType);
      schema.setLength(field.get("length").getAsLong());
      schema.setPrecision(field.get("precision").getAsInt());
      schema.setScale(field.get("scale").getAsInt());
      schema.setNullable(field.get("nillable").getAsBoolean());
      schema.setFormat(null);

      // JsonObject.get returns null for an absent key; treat that like an explicit JSON null.
      JsonElement label = field.get("label");
      schema.setComment((label == null || label.isJsonNull()) ? null : label.getAsString());
      JsonElement defaultValue = field.get("defaultValue");
      schema.setDefaultValue((defaultValue == null || defaultValue.isJsonNull()) ? null : defaultValue.getAsString());

      schema.setUnique(field.get("unique").getAsBoolean());

      String jsonStr = GSON.toJson(schema);
      JsonObject obj = GSON.fromJson(jsonStr, JsonObject.class).getAsJsonObject();
      fieldJsonArray.add(obj);
    }
  } catch (SchemaException e) {
    // Already carries a specific message; do not wrap it a second time.
    throw e;
  } catch (Exception e) {
    throw new SchemaException("Failed to get schema from salesforce; error - " + e.getMessage(), e);
  }
  return fieldJsonArray;
}
use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.
the class QueryBasedExtractor method build.
/**
 * Builds the schema, the high watermark and the source record count for this extractor.
 *
 * <p>Steps, in order: set the connection timeout, extract the metadata, and, if a watermark
 * column is configured, determine the high watermark and set the range predicates. For the last
 * partition the high watermark is queried from the source; otherwise the partition's own high
 * watermark is used. Finally the source record count is computed unless count calculation is
 * configured to be skipped, in which case it is set to -1. A count of 0 sets the fetch status to
 * false.
 *
 * @return this extractor
 * @throws ExtractPrepareException if the schema, high watermark or record count cannot be
 *         obtained, or any other exception occurs inside the preparation steps
 * @throws IllegalArgumentException if the configured watermark type is not a
 *         {@code WatermarkType} constant; this is resolved before the try block, so it is not
 *         wrapped
 */
public Extractor<S, D> build() throws ExtractPrepareException {
  String watermarkColumn = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
  long lwm = partition.getLowWatermark();
  long hwm = partition.getHighWatermark();
  log.info("Low water mark: " + lwm + "; and High water mark: " + hwm);

  String watermarkTypeName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
  WatermarkType watermarkType;
  if (StringUtils.isBlank(watermarkTypeName)) {
    watermarkType = null;
  } else {
    // Upper-case with a fixed locale: with the default locale a value such as "timestamp" does
    // not map to the enum constant name under locales like Turkish.
    watermarkType = WatermarkType.valueOf(watermarkTypeName.toUpperCase(java.util.Locale.ROOT));
  }

  log.info("Source Entity is " + this.entity);
  try {
    this.setTimeOut(this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_TIMEOUT, ConfigurationKeys.DEFAULT_CONN_TIMEOUT));
    this.extractMetadata(this.schema, this.entity, this.workUnit);

    if (StringUtils.isNotBlank(watermarkColumn)) {
      if (partition.isLastPartition()) {
        // Get a more accurate high watermark from the source
        long adjustedHighWatermark = this.getLatestWatermark(watermarkColumn, watermarkType, lwm, hwm);
        log.info("High water mark from source: " + adjustedHighWatermark);
        // If the source returned the default watermark value, use the low watermark as the high
        // watermark (with no delta), i.e. don't move the pointer.
        if (adjustedHighWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
          adjustedHighWatermark = getLowWatermarkWithNoDelta(lwm);
        }
        this.highWatermark = adjustedHighWatermark;
      } else {
        this.highWatermark = hwm;
      }
      log.info("High water mark for the current run: " + highWatermark);
      this.setRangePredicates(watermarkColumn, watermarkType, lwm, highWatermark);
    }

    // If skip-count is set to true, skip count calculation and set source count to -1
    if (Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC))) {
      log.info("Skip count calculation");
      this.sourceRecordCount = -1;
    } else {
      this.sourceRecordCount = this.getSourceCount(this.schema, this.entity, this.workUnit, this.predicateList);
    }

    if (this.sourceRecordCount == 0) {
      log.info("Record count is 0; Setting fetch status to false to skip readRecord()");
      this.setFetchStatus(false);
    }
  } catch (SchemaException e) {
    throw new ExtractPrepareException("Failed to get schema for this object; error - " + e.getMessage(), e);
  } catch (HighWatermarkException e) {
    throw new ExtractPrepareException("Failed to get high watermark; error - " + e.getMessage(), e);
  } catch (RecordCountException e) {
    throw new ExtractPrepareException("Failed to get record count; error - " + e.getMessage(), e);
  } catch (Exception e) {
    throw new ExtractPrepareException("Failed to prepare the extract build; error - " + e.getMessage(), e);
  }
  return this;
}
Aggregations