
Example 1 with InputStreamCSVReader

use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.

the class CsvToJsonConverter method convertRecord.

/**
 * Takes in a record in String format and splits it based on CONVERTER_CSV_TO_JSON_DELIMITER
 * Uses the output schema and the split record to convert the record into a JsonObject
 * @return a JsonObject representing the record
 * @throws DataConversionException
 */
@Override
public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String inputRecord, WorkUnitState workUnit) throws DataConversionException {
    try {
        String strDelimiter = workUnit.getProp(ConfigurationKeys.CONVERTER_CSV_TO_JSON_DELIMITER);
        if (Strings.isNullOrEmpty(strDelimiter)) {
            throw new IllegalArgumentException("Delimiter cannot be empty");
        }
        char enclosedChar = workUnit.getProp(ConfigurationKeys.CONVERTER_CSV_TO_JSON_ENCLOSEDCHAR,
                ConfigurationKeys.DEFAULT_CONVERTER_CSV_TO_JSON_ENCLOSEDCHAR).charAt(0);
        InputStreamCSVReader reader = new InputStreamCSVReader(inputRecord, strDelimiter.charAt(0), enclosedChar);
        List<String> recordSplit = Lists.newArrayList(reader.splitRecord());
        JsonObject outputRecord = new JsonObject();
        // missing, null, empty, and literal "null" fields all become JSON null
        for (int i = 0; i < outputSchema.size(); i++) {
            if (i < recordSplit.size()) {
                if (recordSplit.get(i) == null) {
                    outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
                } else if (recordSplit.get(i).isEmpty() || recordSplit.get(i).toLowerCase().equals(NULL)) {
                    outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
                } else {
                    outputRecord.addProperty(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), recordSplit.get(i));
                }
            } else {
                outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
            }
        }
        return new SingleRecordIterable<>(outputRecord);
    } catch (Exception e) {
        throw new DataConversionException(e);
    }
}
Also used : SingleRecordIterable(org.apache.gobblin.converter.SingleRecordIterable) JsonObject(com.google.gson.JsonObject) InputStreamCSVReader(org.apache.gobblin.source.extractor.utils.InputStreamCSVReader) DataConversionException(org.apache.gobblin.converter.DataConversionException) IOException(java.io.IOException) SchemaConversionException(org.apache.gobblin.converter.SchemaConversionException)
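
To make the delimiter and null handling concrete, here is a minimal, hypothetical sketch (not part of the Gobblin sources; the sample record, column names, and expected output are assumptions) that feeds one CSV line through the same InputStreamCSVReader constructor the converter uses.

import java.util.List;
import com.google.common.collect.Lists;
import org.apache.gobblin.source.extractor.utils.InputStreamCSVReader;

public class CsvSplitSketch {
    public static void main(String[] args) throws Exception {
        // Same constructor shape as in convertRecord(): (record, delimiter, enclosing character).
        // The enclosing character lets the second field keep its embedded comma.
        InputStreamCSVReader reader = new InputStreamCSVReader("1,\"Smith, Jane\",null", ',', '"');
        List<String> fields = Lists.newArrayList(reader.splitRecord());
        // Expected split (assumption): ["1", "Smith, Jane", "null"].
        // With a three-column schema the converter would turn the last field into JsonNull,
        // since empty strings and the literal "null" are mapped to JsonNull.INSTANCE.
        System.out.println(fields);
    }
}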

Example 2 with InputStreamCSVReader

use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.

the class SalesforceExtractor method reinitializeBufferedReader.

/**
 * Reinitialize the state of {@link #bulkBufferedReader} to handle network disconnects
 * @throws IOException
 * @throws AsyncApiException
 */
private void reinitializeBufferedReader() throws IOException, AsyncApiException {
    // close reader and get a new input stream to reconnect to resolve intermittent network errors
    this.bulkBufferedReader.close();
    this.bulkBufferedReader = getBulkBufferedReader(this.bulkResultIdCount - 1);
    // if the result set is partially processed then we need to skip over processed records
    if (!isNewBulkResultSet()) {
        List<String> lastCsvRecord = null;
        InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader);
        // skip header
        reader.nextRecord();
        int recordsToSkip = this.bulkRecordCount - this.prevBulkRecordCount;
        log.info("Skipping {} records on retry: ", recordsToSkip);
        for (int i = 0; i < recordsToSkip; i++) {
            lastCsvRecord = reader.nextRecord();
        }
        // verify we are repositioned at the last processed record; the next unprocessed
        // record is then read by the next call to fetchResultBatch()
        if (recordsToSkip > 0) {
            if (!this.csvRecord.equals(lastCsvRecord)) {
                throw new RuntimeException("Repositioning after reconnecting did not point to the expected record");
            }
        }
    }
}
Also used : InputStreamCSVReader(org.apache.gobblin.source.extractor.utils.InputStreamCSVReader)
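
The skip-and-verify retry logic is easier to follow in isolation. The following hypothetical sketch (the in-memory CSV, the counts, and the class name are invented for illustration) replays the same steps as reinitializeBufferedReader() against a plain BufferedReader.

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import org.apache.gobblin.source.extractor.utils.InputStreamCSVReader;

public class ResumeSketch {
    public static void main(String[] args) throws Exception {
        String csv = "Id,Name\n1,Alice\n2,Bob\n3,Carol\n";
        // state carried over from before the simulated disconnect
        List<String> lastProcessed = Arrays.asList("2", "Bob");
        int recordsToSkip = 2; // bulkRecordCount - prevBulkRecordCount

        InputStreamCSVReader reader =
                new InputStreamCSVReader(new BufferedReader(new StringReader(csv)));
        // skip header, as reinitializeBufferedReader() does
        reader.nextRecord();

        List<String> lastSkipped = null;
        for (int i = 0; i < recordsToSkip; i++) {
            lastSkipped = reader.nextRecord();
        }
        if (recordsToSkip > 0 && !lastProcessed.equals(lastSkipped)) {
            throw new RuntimeException("Repositioning after reconnecting did not point to the expected record");
        }
        // the next nextRecord() call would return the first unprocessed row: ["3", "Carol"]
    }
}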

Example 3 with InputStreamCSVReader

use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.

the class SalesforceExtractor method fetchResultBatch.

/**
 * Fetch records into a {@link RecordSetList} up to the configured batch size {@link #batchSize}. This batch is not
 * the entire Salesforce result batch. It is an internal batch in the extractor for buffering a subset of the result
 * stream that comes from a Salesforce batch for more efficient processing.
 * @param rs the record set to fetch into
 * @param initialRecordCount Initial record count to use. This should correspond to the number of records already in rs.
 *                           This is used to limit the number of records returned in rs to {@link #batchSize}.
 * @throws DataRecordException
 * @throws IOException
 */
private void fetchResultBatch(RecordSetList<JsonElement> rs, int initialRecordCount) throws DataRecordException, IOException {
    int recordCount = initialRecordCount;
    // Stream the resultset through CSV reader to identify columns in each record
    InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader);
    // Get header if it is first run of a new resultset
    if (this.isNewBulkResultSet()) {
        this.bulkRecordHeader = reader.nextRecord();
        this.bulkResultColumCount = this.bulkRecordHeader.size();
        this.setNewBulkResultSet(false);
    }
    // Get record from CSV reader stream
    while ((this.csvRecord = reader.nextRecord()) != null) {
        // Convert CSV record to JsonObject
        JsonObject jsonObject = Utils.csvToJsonObject(this.bulkRecordHeader, this.csvRecord, this.bulkResultColumCount);
        rs.add(jsonObject);
        recordCount++;
        this.bulkRecordCount++;
        // Insert records in record set until it reaches the batch size
        if (recordCount >= batchSize) {
            log.info("Total number of records processed so far: " + this.bulkRecordCount);
            break;
        }
    }
}
Also used : JsonObject(com.google.gson.JsonObject) InputStreamCSVReader(org.apache.gobblin.source.extractor.utils.InputStreamCSVReader)
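
The per-record conversion step above hinges on pairing the cached header with each CSV row. Below is a hypothetical sketch of that mapping in isolation; the header, row values, expected output, and the import path assumed for the Utils helper are illustrative assumptions.

import java.util.Arrays;
import java.util.List;
import com.google.gson.JsonObject;
// assumed location of the Utils helper referenced in fetchResultBatch()
import org.apache.gobblin.source.extractor.utils.Utils;

public class CsvRowToJsonSketch {
    public static void main(String[] args) throws Exception {
        List<String> bulkRecordHeader = Arrays.asList("Id", "Name", "Amount");
        List<String> csvRecord = Arrays.asList("001", "Acme", "42.50");

        // pairs each header column with the value at the same position, one record at a time,
        // which is what fetchResultBatch() does while streaming the bulk result
        JsonObject jsonObject = Utils.csvToJsonObject(bulkRecordHeader, csvRecord, bulkRecordHeader.size());
        System.out.println(jsonObject); // e.g. {"Id":"001","Name":"Acme","Amount":"42.50"}
    }
}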

Aggregations

InputStreamCSVReader (org.apache.gobblin.source.extractor.utils.InputStreamCSVReader): 3 usages
JsonObject (com.google.gson.JsonObject): 2 usages
IOException (java.io.IOException): 1 usage
DataConversionException (org.apache.gobblin.converter.DataConversionException): 1 usage
SchemaConversionException (org.apache.gobblin.converter.SchemaConversionException): 1 usage
SingleRecordIterable (org.apache.gobblin.converter.SingleRecordIterable): 1 usage