Search in sources :

Example 1 with SchemaException

use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.

the class RestApiExtractor method extractMetadata.

/**
 * Builds the output schema for {@code entity} from metadata fetched over the REST API.
 *
 * <p>Connects through {@code this.connector}, runs the commands returned by
 * {@code getSchemaMetadata(schema, entity)} and turns the response into column definitions via
 * {@code getSchema}. Every column is flagged as watermark and/or primary key according to the
 * {@code extract.delta.fields} and {@code extract.primary.key.fields} properties. A column is kept
 * when no query is configured, when the query selects {@code *}, or when {@code isMetadataColumn}
 * accepts it against the query's column list. Columns listed in the excluded-columns property are
 * dropped; that property is only honoured when no query is configured. Kept columns are appended to
 * {@code this.columnList}; afterwards {@code this.updatedQuery} and the output schema are set.
 *
 * @param schema source schema name, forwarded to {@code getSchemaMetadata}
 * @param entity source entity name, forwarded to {@code getSchemaMetadata} and {@code buildDataQuery}
 * @param workUnit not read by this method; all properties come from {@code workUnitState}
 * @throws SchemaException if the connection fails or any step of the schema retrieval throws
 */
@Override
public void extractMetadata(String schema, String entity, WorkUnit workUnit) throws SchemaException {
    log.info("Extract Metadata using Rest Api");
    JsonArray columnArray = new JsonArray();
    String inputQuery = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_QUERY);
    List<String> columnListInQuery = null;
    if (!Strings.isNullOrEmpty(inputQuery)) {
        columnListInQuery = Utils.getColumnListFromQuery(inputQuery);
    }
    String excludedColumns = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXCLUDED_COLUMNS);
    List<String> columnListExcluded = ImmutableList.<String>of();
    if (Strings.isNullOrEmpty(inputQuery) && !Strings.isNullOrEmpty(excludedColumns)) {
        Splitter splitter = Splitter.on(",").omitEmptyStrings().trimResults();
        columnListExcluded = splitter.splitToList(excludedColumns.toLowerCase());
    }
    try {
        boolean success = this.connector.connect();
        if (!success) {
            throw new SchemaException("Failed to connect.");
        }
        log.debug("Connected successfully.");
        List<Command> cmds = this.getSchemaMetadata(schema, entity);
        CommandOutput<?, ?> response = this.connector.getResponse(cmds);
        JsonArray array = this.getSchema(response);

        // Loop-invariant inputs: read the properties once instead of once (or twice) per column.
        String watermarkFields = workUnitState.getProp("extract.delta.fields");
        String primaryKeyFields = workUnitState.getProp("extract.primary.key.fields");
        // With no query, no parsed column list, or a bare "*", every source column is a candidate.
        boolean allColumnsSelected = inputQuery == null || columnListInQuery == null
            || (columnListInQuery.size() == 1 && columnListInQuery.get(0).equals("*"));

        for (JsonElement columnElement : array) {
            Schema obj = GSON.fromJson(columnElement, Schema.class);
            String columnName = obj.getColumnName();
            boolean watermarkColumn = this.isWatermarkColumn(watermarkFields, columnName);
            int primaryKeyIndex = this.getPrimarykeyIndex(primaryKeyFields, columnName);

            obj.setWaterMark(watermarkColumn);
            if (watermarkColumn) {
                obj.setNullable(false);
            } else if (primaryKeyIndex == 0) {
                // Index 0 is treated as "not a primary key": such columns are forced nullable.
                // Primary-key columns keep whatever nullability the source metadata reported.
                obj.setNullable(true);
            }
            obj.setPrimaryKey(primaryKeyIndex);

            String jsonStr = GSON.toJson(obj);
            JsonObject jsonObject = GSON.fromJson(jsonStr, JsonObject.class).getAsJsonObject();

            // Otherwise keep only the columns that the query's column list mentions.
            boolean selected = allColumnsSelected
                || (columnListInQuery.size() >= 1 && this.isMetadataColumn(columnName, columnListInQuery));
            if (selected && !columnListExcluded.contains(columnName.trim().toLowerCase())) {
                this.columnList.add(columnName);
                columnArray.add(jsonObject);
            }
        }
        this.updatedQuery = buildDataQuery(inputQuery, entity);
        log.info("Schema:" + columnArray);
        this.setOutputSchema(columnArray);
    } catch (RuntimeException | RestApiConnectionException | RestApiProcessingException | IOException | SchemaException e) {
        throw new SchemaException("Failed to get schema using rest api; error - " + e.getMessage(), e);
    }
}
Also used : SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) Splitter(com.google.common.base.Splitter) Schema(org.apache.gobblin.source.extractor.schema.Schema) JsonObject(com.google.gson.JsonObject) RestApiProcessingException(org.apache.gobblin.source.extractor.exception.RestApiProcessingException) IOException(java.io.IOException) RestApiConnectionException(org.apache.gobblin.source.extractor.exception.RestApiConnectionException) JsonArray(com.google.gson.JsonArray) Command(org.apache.gobblin.source.extractor.extract.Command) JsonElement(com.google.gson.JsonElement)

Example 2 with SchemaException

use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.

the class JdbcExtractor method getSchema.

/**
 * Converts the first result set in {@code response} into a JSON array of column definitions.
 *
 * <p>Each row of the result set is read positionally: column name (1), source data type (2),
 * length (3), precision (4), scale (5), nullable flag (6), format (7) and comment (8). The source
 * data type is translated through {@code convertDataType}; default value is always {@code null} and
 * the unique flag is always {@code false}.
 *
 * @param response command output whose first result value is expected to be a {@link ResultSet}
 * @return one JSON object per row of the result set, in row order
 * @throws SchemaException if {@code response} holds no result, or if reading any row fails
 * @throws IOException declared by the overridden signature; not thrown directly by this body
 */
@Override
public JsonArray getSchema(CommandOutput<?, ?> response) throws SchemaException, IOException {
    this.log.debug("Extract schema from resultset");
    Iterator<ResultSet> resultSets = (Iterator<ResultSet>) response.getResults().values().iterator();
    if (!resultSets.hasNext()) {
        throw new SchemaException("Failed to get schema from database - Resultset has no records");
    }
    ResultSet metadataRows = resultSets.next();
    JsonArray columns = new JsonArray();
    try {
        while (metadataRows.next()) {
            Schema column = new Schema();

            String name = metadataRows.getString(1);
            column.setColumnName(name);

            String sourceType = metadataRows.getString(2);
            String elementType = "string";
            List<String> symbols = null;
            column.setDataType(this.convertDataType(name, sourceType, elementType, symbols));

            column.setLength(metadataRows.getLong(3));
            column.setPrecision(metadataRows.getInt(4));
            column.setScale(metadataRows.getInt(5));
            column.setNullable(metadataRows.getBoolean(6));
            column.setFormat(metadataRows.getString(7));
            column.setComment(metadataRows.getString(8));
            column.setDefaultValue(null);
            column.setUnique(false);

            // Round-trip through a JSON string to obtain the column as a JsonObject.
            String serialized = gson.toJson(column);
            columns.add(gson.fromJson(serialized, JsonObject.class).getAsJsonObject());
        }
    } catch (Exception e) {
        throw new SchemaException("Failed to get schema from database; error - " + e.getMessage(), e);
    }
    return columns;
}
Also used : JsonArray(com.google.gson.JsonArray) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) Schema(org.apache.gobblin.source.extractor.schema.Schema) ResultSet(java.sql.ResultSet) Iterator(java.util.Iterator) JsonObject(com.google.gson.JsonObject) ParseException(java.text.ParseException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) SqlParseException(org.apache.calcite.sql.parser.SqlParseException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) SQLException(java.sql.SQLException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException)

Example 3 with SchemaException

use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.

the class JdbcExtractor method extractMetadata.

/**
 * Prepares the JDBC extract: resolves the target schema, the header columns, the output column
 * projection and the extract SQL for {@code entity}.
 *
 * <p>The source metadata is fetched with the commands from {@code getSchemaMetadata(schema, entity)},
 * the configured query is parsed, and every entry of {@code this.columnAliasMap} that
 * {@code isMetadataColumn} accepts is added to the target schema, the header and
 * {@code this.columnList}. When {@code hasMultipleWatermarkColumns} is true, the default (derived)
 * watermark column is appended as well and the delta-fields property is overwritten with its name.
 *
 * @param schema source schema name, forwarded to {@code getSchemaMetadata} and {@code getExtractQuery}
 * @param entity source entity name, forwarded to {@code getSchemaMetadata} and {@code getExtractQuery}
 * @param workUnit not read by this method; properties come from {@code workUnitState}
 * @throws SchemaException if anything inside the try block throws a RuntimeException, IOException or
 *         SchemaException (the original exception is kept as the cause)
 * @throws IOException declared by the signature; inside the try block it is wrapped in SchemaException
 */
@Override
public void extractMetadata(String schema, String entity, WorkUnit workUnit) throws SchemaException, IOException {
    this.log.info("Extract metadata using JDBC");
    String inputQuery = workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_QUERY);
    // NOTE(review): thrown before the try block, so this surfaces as a raw RuntimeException rather
    // than the SchemaException used for every other failure in this method.
    if (hasJoinOperation(inputQuery)) {
        throw new RuntimeException("Query across multiple tables not supported");
    }
    String watermarkColumn = workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    this.enableDelimitedIdentifier = workUnitState.getPropAsBoolean(ConfigurationKeys.ENABLE_DELIMITED_IDENTIFIER, ConfigurationKeys.DEFAULT_ENABLE_DELIMITED_IDENTIFIER);
    JsonObject defaultWatermark = this.getDefaultWatermark();
    String derivedWatermarkColumnName = defaultWatermark.get("columnName").getAsString();
    // Record the sample size requested by the query, then strip the sample clause from the query.
    this.setSampleRecordCount(this.exractSampleRecordCountFromQuery(inputQuery));
    inputQuery = this.removeSampleClauseFromQuery(inputQuery);
    JsonArray targetSchema = new JsonArray();
    List<String> headerColumns = new ArrayList<>();
    try {
        List<Command> cmds = this.getSchemaMetadata(schema, entity);
        CommandOutput<?, ?> response = this.executePreparedSql(cmds);
        JsonArray array = this.getSchema(response);
        // NOTE(review): these helpers presumably populate the state read below
        // (getMetadataColumnList(), columnAliasMap) — the call order looks significant; confirm
        // before reordering.
        this.buildMetadataColumnMap(array);
        this.parseInputQuery(inputQuery);
        List<String> sourceColumns = this.getMetadataColumnList();
        for (ColumnAttributes colMap : this.columnAliasMap) {
            String alias = colMap.getAliasName();
            String columnName = colMap.getColumnName();
            String sourceColumnName = colMap.getSourceColumnName();
            // Only columns recognised against the source metadata contribute to the output.
            if (this.isMetadataColumn(columnName, sourceColumns)) {
                String targetColumnName = this.getTargetColumnName(columnName, alias);
                Schema obj = this.getUpdatedSchemaObject(columnName, alias, targetColumnName);
                // Round-trip through a JSON string to obtain the schema entry as a JsonObject.
                String jsonStr = gson.toJson(obj);
                JsonObject jsonObject = gson.fromJson(jsonStr, JsonObject.class).getAsJsonObject();
                targetSchema.add(jsonObject);
                headerColumns.add(targetColumnName);
                // The projection uses the source column name wrapped in the delimited-identifier pair.
                sourceColumnName = getLeftDelimitedIdentifier() + sourceColumnName + getRightDelimitedIdentifier();
                this.columnList.add(sourceColumnName);
            }
        }
        if (this.hasMultipleWatermarkColumns(watermarkColumn)) {
            // Add the derived watermark column (delimited) to the projection, header and schema, and
            // make it the delta field for the rest of the work unit.
            derivedWatermarkColumnName = getLeftDelimitedIdentifier() + derivedWatermarkColumnName + getRightDelimitedIdentifier();
            this.columnList.add(derivedWatermarkColumnName);
            headerColumns.add(derivedWatermarkColumnName);
            targetSchema.add(defaultWatermark);
            this.workUnitState.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, derivedWatermarkColumnName);
        }
        String outputColProjection = Joiner.on(",").useForNull("null").join(this.columnList);
        // NOTE(review): this replace runs even when the multiple-watermark branch above was skipped,
        // in which case derivedWatermarkColumnName is the undelimited default name; any projected
        // column containing that text would be rewritten too — confirm this is intended.
        outputColProjection = outputColProjection.replace(derivedWatermarkColumnName, Utils.getCoalesceColumnNames(watermarkColumn) + " AS " + derivedWatermarkColumnName);
        this.setOutputColumnProjection(outputColProjection);
        String extractQuery = this.getExtractQuery(schema, entity, inputQuery);
        this.setHeaderRecord(headerColumns);
        this.setOutputSchema(targetSchema);
        this.setExtractSql(extractQuery);
        // NOTE(review): the three lines below are commented-out code with no effect; candidate for removal.
        // this.workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY,
        // this.escapeCharsInColumnName(this.workUnit.getProp(ConfigurationKeys.SOURCE_ENTITY),
        // ConfigurationKeys.ESCAPE_CHARS_IN_COLUMN_NAME, "_"));
        this.log.info("Schema:" + targetSchema);
        this.log.info("Extract query: " + this.getExtractSql());
    } catch (RuntimeException | IOException | SchemaException e) {
        // A SchemaException raised inside the block is wrapped again with this method's message.
        throw new SchemaException("Failed to get metadata using JDBC; error - " + e.getMessage(), e);
    }
}
Also used : SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) ColumnAttributes(org.apache.gobblin.source.extractor.schema.ColumnAttributes) Schema(org.apache.gobblin.source.extractor.schema.Schema) ArrayList(java.util.ArrayList) JsonObject(com.google.gson.JsonObject) IOException(java.io.IOException) JsonArray(com.google.gson.JsonArray) Command(org.apache.gobblin.source.extractor.extract.Command)

Example 4 with SchemaException

use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.

the class SalesforceExtractor method getSchema.

/**
 * Converts the Salesforce describe response in {@code response} into a JSON array of column
 * definitions.
 *
 * <p>The first result value is parsed as a JSON object and its {@code fields} array is walked. For
 * each field the name, type (translated through {@code convertDataType}), length, precision, scale,
 * {@code nillable}, label, default value and {@code unique} members are copied into a
 * {@link Schema}; format is always {@code null}.
 *
 * <p>Parsing happens inside the try block so that a malformed, empty or {@code fields}-less response
 * is reported as a {@link SchemaException} with a descriptive message instead of escaping as an
 * unchecked parser exception or a {@code NullPointerException}.
 *
 * @param response command output whose first result value is expected to be the JSON response text
 * @return one JSON object per entry of the response's {@code fields} array, in order
 * @throws SchemaException if {@code response} holds no result, the response cannot be parsed, the
 *         {@code fields} array is missing, or any field cannot be converted
 */
@Override
public JsonArray getSchema(CommandOutput<?, ?> response) throws SchemaException {
    log.info("Get schema from salesforce");
    Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator();
    if (!itr.hasNext()) {
        throw new SchemaException("Failed to get schema from salesforce; REST response has no output");
    }
    String output = itr.next();
    JsonArray fieldJsonArray = new JsonArray();
    try {
        // Gson returns null for null/empty input, and getAsJsonArray returns null for an absent member.
        JsonObject jsonObject = GSON.fromJson(output, JsonObject.class);
        JsonArray array = jsonObject == null ? null : jsonObject.getAsJsonArray("fields");
        if (array == null) {
            throw new SchemaException("REST response does not contain a 'fields' array");
        }
        for (JsonElement columnElement : array) {
            JsonObject field = columnElement.getAsJsonObject();
            String columnName = field.get("name").getAsString();
            Schema schema = new Schema();
            schema.setColumnName(columnName);
            String dataType = field.get("type").getAsString();
            String elementDataType = "string";
            List<String> mapSymbols = null;
            JsonObject newDataType = this.convertDataType(columnName, dataType, elementDataType, mapSymbols);
            log.debug("ColumnName:" + columnName + ";   old datatype:" + dataType + ";   new datatype:" + newDataType);
            schema.setDataType(newDataType);
            schema.setLength(field.get("length").getAsLong());
            schema.setPrecision(field.get("precision").getAsInt());
            schema.setScale(field.get("scale").getAsInt());
            schema.setNullable(field.get("nillable").getAsBoolean());
            schema.setFormat(null);
            // label and defaultValue may be JSON null; map that to a Java null.
            JsonElement label = field.get("label");
            schema.setComment(label.isJsonNull() ? null : label.getAsString());
            JsonElement defaultValue = field.get("defaultValue");
            schema.setDefaultValue(defaultValue.isJsonNull() ? null : defaultValue.getAsString());
            schema.setUnique(field.get("unique").getAsBoolean());
            // Round-trip through a JSON string to obtain the column as a JsonObject.
            String jsonStr = GSON.toJson(schema);
            JsonObject obj = GSON.fromJson(jsonStr, JsonObject.class).getAsJsonObject();
            fieldJsonArray.add(obj);
        }
    } catch (Exception e) {
        throw new SchemaException("Failed to get schema from salesforce; error - " + e.getMessage(), e);
    }
    return fieldJsonArray;
}
Also used : JsonArray(com.google.gson.JsonArray) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) JsonElement(com.google.gson.JsonElement) Schema(org.apache.gobblin.source.extractor.schema.Schema) ListIterator(java.util.ListIterator) Iterator(java.util.Iterator) JsonObject(com.google.gson.JsonObject) ParseException(java.text.ParseException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) AsyncApiException(com.sforce.async.AsyncApiException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RestApiClientException(org.apache.gobblin.source.extractor.exception.RestApiClientException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) RestApiConnectionException(org.apache.gobblin.source.extractor.exception.RestApiConnectionException)

Example 5 with SchemaException

use of org.apache.gobblin.source.extractor.exception.SchemaException in project incubator-gobblin by apache.

the class QueryBasedExtractor method build.

/**
 * Builds the schema, the record count and the high watermark for this extractor.
 *
 * <p>Steps, in order: set the connection timeout, extract metadata for the configured schema and
 * entity, resolve the high watermark and range predicates when a watermark column is configured,
 * then compute the source record count unless count calculation is skipped. A record count of
 * {@code 0} switches the fetch status off.
 *
 * @return this extractor
 * @throws ExtractPrepareException if schema extraction, high watermark resolution, record counting
 *         or any other preparation step inside the try block fails; the original exception is kept
 *         as the cause
 */
public Extractor<S, D> build() throws ExtractPrepareException {
    String watermarkColumn = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    long lwm = partition.getLowWatermark();
    long hwm = partition.getHighWatermark();
    log.info("Low water mark: " + lwm + "; and High water mark: " + hwm);
    String watermarkTypeName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
    WatermarkType watermarkType;
    if (StringUtils.isBlank(watermarkTypeName)) {
        watermarkType = null;
    } else {
        // Locale.ROOT keeps the enum lookup independent of the JVM default locale
        // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
        watermarkType = WatermarkType.valueOf(watermarkTypeName.toUpperCase(java.util.Locale.ROOT));
    }
    log.info("Source Entity is " + this.entity);
    try {
        this.setTimeOut(this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_TIMEOUT, ConfigurationKeys.DEFAULT_CONN_TIMEOUT));
        this.extractMetadata(this.schema, this.entity, this.workUnit);
        if (StringUtils.isNotBlank(watermarkColumn)) {
            if (partition.isLastPartition()) {
                // Get a more accurate high watermark from the source
                long adjustedHighWatermark = this.getLatestWatermark(watermarkColumn, watermarkType, lwm, hwm);
                log.info("High water mark from source: " + adjustedHighWatermark);
                // If the source only yields the default watermark value, fall back to the low
                // watermark with no delta as the high watermark, i.e. don't move the pointer.
                if (adjustedHighWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
                    adjustedHighWatermark = getLowWatermarkWithNoDelta(lwm);
                }
                this.highWatermark = adjustedHighWatermark;
            } else {
                this.highWatermark = hwm;
            }
            log.info("High water mark for the current run: " + highWatermark);
            this.setRangePredicates(watermarkColumn, watermarkType, lwm, highWatermark);
        }
        // When the skip-count property is "true", skip count calculation and set the source count to -1.
        boolean skipCountCalc = Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC));
        if (!skipCountCalc) {
            this.sourceRecordCount = this.getSourceCount(this.schema, this.entity, this.workUnit, this.predicateList);
        } else {
            log.info("Skip count calculation");
            this.sourceRecordCount = -1;
        }
        if (this.sourceRecordCount == 0) {
            log.info("Record count is 0; Setting fetch status to false to skip readRecord()");
            this.setFetchStatus(false);
        }
    } catch (SchemaException e) {
        throw new ExtractPrepareException("Failed to get schema for this object; error - " + e.getMessage(), e);
    } catch (HighWatermarkException e) {
        throw new ExtractPrepareException("Failed to get high watermark; error - " + e.getMessage(), e);
    } catch (RecordCountException e) {
        throw new ExtractPrepareException("Failed to get record count; error - " + e.getMessage(), e);
    } catch (Exception e) {
        // Boundary catch-all: anything else raised while preparing the extract is reported uniformly.
        throw new ExtractPrepareException("Failed to prepare the extract build; error - " + e.getMessage(), e);
    }
    return this;
}
Also used : SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) ExtractPrepareException(org.apache.gobblin.source.extractor.exception.ExtractPrepareException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) ExtractPrepareException(org.apache.gobblin.source.extractor.exception.ExtractPrepareException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException)

Aggregations

IOException (java.io.IOException)6 SchemaException (org.apache.gobblin.source.extractor.exception.SchemaException)6 JsonArray (com.google.gson.JsonArray)5 JsonObject (com.google.gson.JsonObject)5 Schema (org.apache.gobblin.source.extractor.schema.Schema)5 DataRecordException (org.apache.gobblin.source.extractor.DataRecordException)4 HighWatermarkException (org.apache.gobblin.source.extractor.exception.HighWatermarkException)4 RecordCountException (org.apache.gobblin.source.extractor.exception.RecordCountException)4 Iterator (java.util.Iterator)3 JsonElement (com.google.gson.JsonElement)2 ResultSet (java.sql.ResultSet)2 ParseException (java.text.ParseException)2 RestApiConnectionException (org.apache.gobblin.source.extractor.exception.RestApiConnectionException)2 Command (org.apache.gobblin.source.extractor.extract.Command)2 Splitter (com.google.common.base.Splitter)1 AsyncApiException (com.sforce.async.AsyncApiException)1 ResultSetMetaData (java.sql.ResultSetMetaData)1 SQLException (java.sql.SQLException)1 ArrayList (java.util.ArrayList)1 ListIterator (java.util.ListIterator)1