Use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.
The class CsvToJsonConverter, method convertRecord.
/**
 * Takes in a record as a String and splits it on the configured delimiter
 * (ConfigurationKeys.CONVERTER_CSV_TO_JSON_DELIMITER).
 * Uses the outputSchema and the split record to convert the record to a JsonObject.
 * @return a JsonObject representing the record
 * @throws DataConversionException
 */
@Override
public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  try {
    String strDelimiter = workUnit.getProp(ConfigurationKeys.CONVERTER_CSV_TO_JSON_DELIMITER);
    if (Strings.isNullOrEmpty(strDelimiter)) {
      throw new IllegalArgumentException("Delimiter cannot be empty");
    }
    InputStreamCSVReader reader = new InputStreamCSVReader(inputRecord, strDelimiter.charAt(0),
        workUnit.getProp(ConfigurationKeys.CONVERTER_CSV_TO_JSON_ENCLOSEDCHAR,
            ConfigurationKeys.DEFAULT_CONVERTER_CSV_TO_JSON_ENCLOSEDCHAR).charAt(0));
    List<String> recordSplit;
    recordSplit = Lists.newArrayList(reader.splitRecord());
    JsonObject outputRecord = new JsonObject();
    for (int i = 0; i < outputSchema.size(); i++) {
      if (i < recordSplit.size()) {
        if (recordSplit.get(i) == null) {
          outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
        } else if (recordSplit.get(i).isEmpty() || recordSplit.get(i).toLowerCase().equals(NULL)) {
          outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
        } else {
          outputRecord.addProperty(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), recordSplit.get(i));
        }
      } else {
        outputRecord.add(outputSchema.get(i).getAsJsonObject().get("columnName").getAsString(), JsonNull.INSTANCE);
      }
    }
    return new SingleRecordIterable<>(outputRecord);
  } catch (Exception e) {
    throw new DataConversionException(e);
  }
}
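For orientation, a rough usage sketch of the method above. The schema shape (objects with a "columnName" key) and the delimiter property come from the snippet; constructing the converter and WorkUnitState directly like this is an assumed, test-style setup, and imports are omitted as in the snippets.

// Illustrative only: schema/record shapes follow the snippet; the surrounding
// setup (main method, sample data, direct WorkUnitState construction) is assumed.
public static void main(String[] args) throws Exception {
  WorkUnitState workUnit = new WorkUnitState();
  workUnit.setProp(ConfigurationKeys.CONVERTER_CSV_TO_JSON_DELIMITER, "|");

  JsonArray outputSchema = new JsonParser()
      .parse("[{\"columnName\":\"id\"},{\"columnName\":\"name\"},{\"columnName\":\"notes\"}]")
      .getAsJsonArray();

  CsvToJsonConverter converter = new CsvToJsonConverter();
  for (JsonObject record : converter.convertRecord(outputSchema, "1|alice|", workUnit)) {
    // The empty trailing field becomes JsonNull, per the loop in convertRecord.
    System.out.println(record); // expected along the lines of {"id":"1","name":"alice","notes":null}
  }
}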
Use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.
The class SalesforceExtractor, method reinitializeBufferedReader.
/**
* Reinitialize the state of {@link #bulkBufferedReader} to handle network disconnects
* @throws IOException
* @throws AsyncApiException
*/
private void reinitializeBufferedReader() throws IOException, AsyncApiException {
  // close reader and get a new input stream to reconnect to resolve intermittent network errors
  this.bulkBufferedReader.close();
  this.bulkBufferedReader = getBulkBufferedReader(this.bulkResultIdCount - 1);
  // if the result set is partially processed then we need to skip over processed records
  if (!isNewBulkResultSet()) {
    List<String> lastCsvRecord = null;
    InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader);
    // skip header
    reader.nextRecord();
    int recordsToSkip = this.bulkRecordCount - this.prevBulkRecordCount;
    log.info("Skipping {} records on retry: ", recordsToSkip);
    for (int i = 0; i < recordsToSkip; i++) {
      lastCsvRecord = reader.nextRecord();
    }
    // unprocessed record is processed in the next call to fetchResultBatch()
    if (recordsToSkip > 0) {
      if (!this.csvRecord.equals(lastCsvRecord)) {
        throw new RuntimeException("Repositioning after reconnecting did not point to the expected record");
      }
    }
  }
}
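The reconnect logic above boils down to: reopen the stream, skip the header plus every record already emitted, and check that the last skipped record equals the record currently held in this.csvRecord. A standalone sketch of that pattern follows; only InputStreamCSVReader(BufferedReader) and nextRecord() are taken from the snippet, while the method name and parameters are hypothetical and imports are omitted.

// Hypothetical sketch of the reposition-and-verify step after a reconnect.
private static void skipProcessedRecords(BufferedReader freshStream, int recordsToSkip,
    List<String> lastEmittedRecord) throws IOException {
  InputStreamCSVReader reader = new InputStreamCSVReader(freshStream);
  reader.nextRecord(); // discard the header row
  List<String> lastSkipped = null;
  for (int i = 0; i < recordsToSkip; i++) {
    lastSkipped = reader.nextRecord();
  }
  // The stream now points at the first unprocessed record; verify we landed where expected.
  if (recordsToSkip > 0 && !lastEmittedRecord.equals(lastSkipped)) {
    throw new RuntimeException("Repositioning after reconnecting did not point to the expected record");
  }
}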
Use of org.apache.gobblin.source.extractor.utils.InputStreamCSVReader in project incubator-gobblin by apache.
The class SalesforceExtractor, method fetchResultBatch.
/**
* Fetch records into a {@link RecordSetList} up to the configured batch size {@link #batchSize}. This batch is not
* the entire Salesforce result batch. It is an internal batch in the extractor for buffering a subset of the result
* stream that comes from a Salesforce batch for more efficient processing.
* @param rs the record set to fetch into
* @param initialRecordCount Initial record count to use. This should correspond to the number of records already in rs.
* This is used to limit the number of records returned in rs to {@link #batchSize}.
* @throws DataRecordException
* @throws IOException
*/
private void fetchResultBatch(RecordSetList<JsonElement> rs, int initialRecordCount)
    throws DataRecordException, IOException {
  int recordCount = initialRecordCount;
  // Stream the resultset through CSV reader to identify columns in each record
  InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader);
  // Get header if it is first run of a new resultset
  if (this.isNewBulkResultSet()) {
    this.bulkRecordHeader = reader.nextRecord();
    this.bulkResultColumCount = this.bulkRecordHeader.size();
    this.setNewBulkResultSet(false);
  }
  // Get record from CSV reader stream
  while ((this.csvRecord = reader.nextRecord()) != null) {
    // Convert CSV record to JsonObject
    JsonObject jsonObject = Utils.csvToJsonObject(this.bulkRecordHeader, this.csvRecord, this.bulkResultColumCount);
    rs.add(jsonObject);
    recordCount++;
    this.bulkRecordCount++;
    // Insert records in record set until it reaches the batch size
    if (recordCount >= batchSize) {
      log.info("Total number of records processed so far: " + this.bulkRecordCount);
      break;
    }
  }
}
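The batching pattern above, reduced to its essentials: read the header row, then convert and buffer CSV records until the internal batch size is reached. The sketch below keeps only InputStreamCSVReader, nextRecord(), and Utils.csvToJsonObject from the snippet; the method name, the plain List in place of RecordSetList, and reading the header at the start of the stream (rather than tracking isNewBulkResultSet across calls) are simplifications, and imports are omitted.

// Hypothetical sketch of the buffering loop; reader/Utils calls mirror the snippet.
private static List<JsonElement> readBatch(BufferedReader in, int batchSize)
    throws DataRecordException, IOException {
  InputStreamCSVReader reader = new InputStreamCSVReader(in);
  List<String> header = reader.nextRecord(); // header row of the resultset
  List<JsonElement> batch = new ArrayList<>();
  List<String> csvRecord;
  while ((csvRecord = reader.nextRecord()) != null) {
    batch.add(Utils.csvToJsonObject(header, csvRecord, header.size()));
    if (batch.size() >= batchSize) {
      break; // stop once the internal batch is full
    }
  }
  return batch;
}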