Use of org.apache.gobblin.source.extractor.DataRecordException in project incubator-gobblin by apache.
The class QueryBasedExtractor, method readRecord.
@Override
public D readRecord(@Deprecated D reuse) throws DataRecordException, IOException {
  if (!this.isPullRequired()) {
    log.info("No more records to read");
    return null;
  }
  D nextElement = null;
  try {
    if (isInitialPull()) {
      log.info("Initial pull");
      if (shouldRemoveDataPullUpperBounds()) {
        this.removeDataPullUpperBounds();
      }
      this.iterator = this.getIterator();
    }
    if (this.iterator.hasNext()) {
      nextElement = this.iterator.next();
      if (!this.iterator.hasNext()) {
        log.debug("Getting next pull");
        this.iterator = this.getIterator();
        if (this.iterator == null) {
          this.setFetchStatus(false);
        }
      }
    }
  } catch (Exception e) {
    throw new DataRecordException("Failed to get records using rest api; error - " + e.getMessage(), e);
  }
  return nextElement;
}
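For context, here is a minimal caller sketch (not from the Gobblin sources) of the contract this method implements: readRecord is invoked repeatedly until it returns null, and any pull failure surfaces as a DataRecordException with the underlying cause attached. The Extractor interface and the readRecord signature are the real Gobblin ones; the drain helper itself is hypothetical.

import java.io.IOException;

import org.apache.gobblin.source.extractor.DataRecordException;
import org.apache.gobblin.source.extractor.Extractor;

public class ExtractorDrain {

  // Hypothetical helper: pulls every record an extractor can produce.
  public static <S, D> long drain(Extractor<S, D> extractor) throws IOException {
    long count = 0;
    try {
      // null is passed for the deprecated 'reuse' argument, matching readRecord's signature
      for (D record = extractor.readRecord(null); record != null; record = extractor.readRecord(null)) {
        count++; // a real Gobblin task would hand each record to converters and writers here
      }
    } catch (DataRecordException e) {
      throw new IOException("Extraction failed", e);
    }
    return count;
  }
}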
Use of org.apache.gobblin.source.extractor.DataRecordException in project incubator-gobblin by apache.
The class KafkaExtractor, method readRecordImpl.
/**
 * Return the next decodable record from the current partition. If the current partition has no more
 * decodable records, move on to the next partition. If all partitions have been processed, return null.
 */
@SuppressWarnings("unchecked")
@Override
public D readRecordImpl(D reuse) throws DataRecordException, IOException {
  long readStartTime = System.nanoTime();
  while (!allPartitionsFinished()) {
    if (currentPartitionFinished()) {
      moveToNextPartition();
      continue;
    }
    if (this.messageIterator == null || !this.messageIterator.hasNext()) {
      try {
        long fetchStartTime = System.nanoTime();
        this.messageIterator = fetchNextMessageBuffer();
        this.currentPartitionFetchMessageBufferTime += System.nanoTime() - fetchStartTime;
      } catch (Exception e) {
        LOG.error(String.format("Failed to fetch next message buffer for partition %s. Will skip this partition.", getCurrentPartition()), e);
        moveToNextPartition();
        continue;
      }
      if (this.messageIterator == null || !this.messageIterator.hasNext()) {
        moveToNextPartition();
        continue;
      }
    }
    while (!currentPartitionFinished()) {
      if (!this.messageIterator.hasNext()) {
        break;
      }
      KafkaConsumerRecord nextValidMessage = this.messageIterator.next();
      // Even though we ask Kafka for a message buffer starting from offset x, the returned buffer
      // may start from an offset smaller than x, so we need to skip messages until we get to x.
      if (nextValidMessage.getOffset() < this.nextWatermark.get(this.currentPartitionIdx)) {
        continue;
      }
      this.nextWatermark.set(this.currentPartitionIdx, nextValidMessage.getNextOffset());
      try {
        D record = null;
        // track time for decode/convert depending on the record type
        long decodeStartTime = System.nanoTime();
        if (nextValidMessage instanceof ByteArrayBasedKafkaRecord) {
          record = decodeRecord((ByteArrayBasedKafkaRecord) nextValidMessage);
        } else if (nextValidMessage instanceof DecodeableKafkaRecord) {
          // if value is null then this is a bad record that is returned for further error handling, so raise an error
          if (((DecodeableKafkaRecord) nextValidMessage).getValue() == null) {
            throw new DataRecordException("Could not decode Kafka record");
          }
          // get value from decodeable record and convert to the output schema if necessary
          record = convertRecord(((DecodeableKafkaRecord<?, D>) nextValidMessage).getValue());
        } else {
          throw new IllegalStateException("Unsupported KafkaConsumerRecord type. The returned record can either be ByteArrayBasedKafkaRecord"
              + " or DecodeableKafkaRecord");
        }
        this.currentPartitionDecodeRecordTime += System.nanoTime() - decodeStartTime;
        this.currentPartitionRecordCount++;
        this.currentPartitionTotalSize += nextValidMessage.getValueSizeInBytes();
        this.currentPartitionReadRecordTime += System.nanoTime() - readStartTime;
        return record;
      } catch (Throwable t) {
        this.errorPartitions.add(this.currentPartitionIdx);
        this.undecodableMessageCount++;
        if (shouldLogError()) {
          LOG.error(String.format("A record from partition %s cannot be decoded.", getCurrentPartition()), t);
          incrementErrorCount();
        }
      }
    }
  }
  LOG.info("Finished pulling topic " + this.topicName);
  this.currentPartitionReadRecordTime += System.nanoTime() - readStartTime;
  return null;
}
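As a hedged illustration of the decode hook this loop relies on: for ByteArrayBasedKafkaRecord messages, concrete extractors supply a decodeRecord implementation. The sketch below decodes the payload as a UTF-8 string. getOffset is taken from the usage above, but the import path and getMessageBytes are assumptions about the Gobblin Kafka module and may vary between versions; treat this as a sketch, not the project's API.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

// Import path assumed; ByteArrayBasedKafkaRecord lives in the Gobblin Kafka module.
import org.apache.gobblin.kafka.client.ByteArrayBasedKafkaRecord;

public class StringKafkaDecodeSketch {

  // Hypothetical decodeRecord implementation: interprets each payload as UTF-8 text.
  // getMessageBytes() is assumed to return the raw value bytes of the record.
  protected String decodeRecord(ByteArrayBasedKafkaRecord kafkaConsumerRecord) throws IOException {
    byte[] payload = kafkaConsumerRecord.getMessageBytes();
    if (payload == null) {
      // mirror the null-value check in readRecordImpl: a null payload is undecodable
      throw new IOException("Null payload at offset " + kafkaConsumerRecord.getOffset());
    }
    return new String(payload, StandardCharsets.UTF_8);
  }
}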
Use of org.apache.gobblin.source.extractor.DataRecordException in project incubator-gobblin by apache.
The class JdbcExtractor, method getData.
@Override
public Iterator<JsonElement> getData(CommandOutput<?, ?> response) throws DataRecordException, IOException {
  this.log.debug("Extract data records from resultset");
  RecordSetList<JsonElement> recordSet = this.getNewRecordSetList();
  if (response == null || !this.hasNextRecord()) {
    return recordSet.iterator();
  }
  ResultSet resultset = null;
  Iterator<ResultSet> itr = (Iterator<ResultSet>) response.getResults().values().iterator();
  if (itr.hasNext()) {
    resultset = itr.next();
  } else {
    throw new DataRecordException("Failed to get source record count from database - Resultset has no records");
  }
  try {
    final ResultSetMetaData resultsetMetadata = resultset.getMetaData();
    int batchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_FETCH_SIZE, 0);
    batchSize = (batchSize == 0 ? ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE : batchSize);
    int recordCount = 0;
    while (resultset.next()) {
      final int numColumns = resultsetMetadata.getColumnCount();
      JsonObject jsonObject = new JsonObject();
      for (int i = 1; i < numColumns + 1; i++) {
        final String columnName = this.getHeaderRecord().get(i - 1);
        jsonObject.addProperty(columnName, parseColumnAsString(resultset, resultsetMetadata, i));
      }
      recordSet.add(jsonObject);
      recordCount++;
      this.totalRecordCount++;
      // Return the partial record set as soon as it reaches the batch size
      if (recordCount >= batchSize) {
        this.log.info("Total number of records processed so far: " + this.totalRecordCount);
        return recordSet.iterator();
      }
    }
    this.setNextRecord(false);
    this.log.info("Total number of records processed so far: " + this.totalRecordCount);
    return recordSet.iterator();
  } catch (Exception e) {
    throw new DataRecordException("Failed to get records from database; error - " + e.getMessage(), e);
  }
}
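The batching behavior above is worth isolating: each call fills a record set until the configured fetch size (SOURCE_QUERYBASED_FETCH_SIZE, falling back to a default) is reached, returns that partial iterator, and marks the extractor exhausted only once the ResultSet runs dry. A self-contained sketch of the same pattern, with illustrative names in place of the Gobblin types:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class BatchingPatternSketch {

  // Stand-in for ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE; the real default may differ.
  private static final int DEFAULT_FETCH_SIZE = 1000;

  // Returns at most batchSize items per call; an empty list signals exhaustion,
  // playing the role of setNextRecord(false) in the method above.
  static <T> List<T> nextBatch(Iterator<T> rows, int batchSize) {
    int size = (batchSize == 0 ? DEFAULT_FETCH_SIZE : batchSize);
    List<T> batch = new ArrayList<>();
    while (rows.hasNext() && batch.size() < size) {
      batch.add(rows.next());
    }
    return batch;
  }
}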
Use of org.apache.gobblin.source.extractor.DataRecordException in project incubator-gobblin by apache.
The class JdbcExtractor, method getRecordSet.
@Override
public Iterator<JsonElement> getRecordSet(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException, IOException {
  Iterator<JsonElement> rs = null;
  List<Command> cmds;
  try {
    if (isFirstPull()) {
      this.log.info("Get data recordset using JDBC");
      cmds = this.getDataMetadata(schema, entity, workUnit, predicateList);
      this.dataResponse = this.executePreparedSql(cmds);
      this.setFirstPull(false);
    }
    rs = this.getData(this.dataResponse);
    return rs;
  } catch (Exception e) {
    throw new DataRecordException("Failed to get record set using JDBC; error - " + e.getMessage(), e);
  }
}
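The isFirstPull/setFirstPull flags implement a simple execute-once cache: the prepared SQL runs on the first call, and every later call drains further batches from the cached response. A minimal sketch of that memoization, using java.util.function.Supplier in place of the JDBC plumbing (all names here are illustrative):

import java.util.function.Supplier;

public class FirstPullCacheSketch<R> {

  private boolean firstPull = true;
  private R dataResponse;

  // 'execute' stands in for getDataMetadata + executePreparedSql; it runs exactly once.
  R getResponse(Supplier<R> execute) {
    if (firstPull) {
      dataResponse = execute.get();
      firstPull = false;
    }
    return dataResponse;
  }
}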
Use of org.apache.gobblin.source.extractor.DataRecordException in project incubator-gobblin by apache.
The class SalesforceExtractor, method getData.
@Override
public Iterator<JsonElement> getData(CommandOutput<?, ?> response) throws DataRecordException {
  log.debug("Get data records from response");
  String output;
  Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator();
  if (itr.hasNext()) {
    output = itr.next();
  } else {
    throw new DataRecordException("Failed to get data from salesforce; REST response has no output");
  }
  List<JsonElement> rs = Lists.newArrayList();
  JsonElement element = GSON.fromJson(output, JsonObject.class);
  JsonArray partRecords;
  try {
    JsonObject jsonObject = element.getAsJsonObject();
    partRecords = jsonObject.getAsJsonArray("records");
    if (jsonObject.get("done").getAsBoolean()) {
      setPullStatus(false);
    } else {
      setNextUrl(this.sfConnector.getFullUri(
          jsonObject.get("nextRecordsUrl").getAsString().replaceAll(this.sfConnector.getServicesDataEnvPath(), "")));
    }
    JsonArray array = Utils.removeElementFromJsonArray(partRecords, "attributes");
    Iterator<JsonElement> li = array.iterator();
    while (li.hasNext()) {
      JsonElement recordElement = li.next();
      rs.add(recordElement);
    }
    return rs.iterator();
  } catch (Exception e) {
    throw new DataRecordException("Failed to get records from salesforce; error - " + e.getMessage(), e);
  }
}
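For orientation, the parser above expects the standard Salesforce query REST payload: a "records" array (each element carrying an "attributes" object that gets stripped), a "done" flag, and a "nextRecordsUrl" when more pages remain. A small self-contained Gson sketch with illustrative sample values:

import com.google.gson.Gson;
import com.google.gson.JsonObject;

public class SalesforceResponseShapeSketch {

  public static void main(String[] args) {
    // Illustrative payload only; field names follow the Salesforce query REST format.
    String output = "{\"done\":false,"
        + "\"nextRecordsUrl\":\"/services/data/v42.0/query/01gXX-2000\","
        + "\"records\":[{\"attributes\":{\"type\":\"Account\"},\"Id\":\"001XX\"}]}";
    JsonObject jsonObject = new Gson().fromJson(output, JsonObject.class);
    // done == false is the branch that calls setNextUrl above; done == true stops the pull
    System.out.println("done: " + jsonObject.get("done").getAsBoolean());
    System.out.println("records: " + jsonObject.getAsJsonArray("records").size());
  }
}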