Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
From the class PinotSegmentToCsvConverter, method convert().
@Override
public void convert() throws Exception {
  PinotSegmentRecordReader recordReader = new PinotSegmentRecordReader(new File(_segmentDir));
  try {
    recordReader.init();
    try (BufferedWriter recordWriter = new BufferedWriter(new FileWriter(_outputFile))) {
      if (_withHeader) {
        // Write a header line built from the field names of the first row, then rewind the reader
        // so that the first row is also emitted as data.
        GenericRow row = recordReader.next();
        recordWriter.write(StringUtils.join(row.getFieldNames(), _delimiter));
        recordWriter.newLine();
        recordReader.rewind();
      }
      while (recordReader.hasNext()) {
        GenericRow row = recordReader.next();
        String[] fields = row.getFieldNames();
        List<String> record = new ArrayList<>(fields.length);
        for (String field : fields) {
          Object value = row.getValue(field);
          // Multi-value columns come back as Object[]; join them with the list delimiter.
          if (value instanceof Object[]) {
            record.add(StringUtils.join((Object[]) value, _listDelimiter));
          } else {
            record.add(value.toString());
          }
        }
        recordWriter.write(StringUtils.join(record, _delimiter));
        recordWriter.newLine();
      }
    }
  } finally {
    recordReader.close();
  }
}
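For context, a minimal sketch of how this converter might be invoked. The constructor arguments below simply mirror the fields used in convert() (_segmentDir, _outputFile, _delimiter, _listDelimiter, _withHeader); the exact constructor signature is an assumption, not something shown in this snippet, so check the class before relying on it.

// Illustrative only: the constructor arguments are assumed from the fields referenced in convert().
public static void main(String[] args) throws Exception {
  PinotSegmentToCsvConverter converter =
      new PinotSegmentToCsvConverter("/path/to/segmentDir", "/path/to/output.csv", ',', ';', true);
  converter.convert();
}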
Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
From the class PinotSegmentToJsonConverter, method convert().
@Override
public void convert() throws Exception {
  PinotSegmentRecordReader recordReader = new PinotSegmentRecordReader(new File(_segmentDir));
  try {
    recordReader.init();
    try (BufferedWriter recordWriter = new BufferedWriter(new FileWriter(_outputFile))) {
      while (recordReader.hasNext()) {
        GenericRow row = recordReader.next();
        JSONObject record = new JSONObject();
        for (String field : row.getFieldNames()) {
          Object value = row.getValue(field);
          // Multi-value columns come back as Object[] and are emitted as JSON arrays.
          if (value instanceof Object[]) {
            record.put(field, new JSONArray(value));
          } else {
            record.put(field, value);
          }
        }
        // One JSON object per line of the output file.
        recordWriter.write(record.toString());
        recordWriter.newLine();
      }
    }
  } finally {
    recordReader.close();
  }
}
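The scalar-versus-array branch above is the entire mapping from a GenericRow field to JSON; a small sketch that factors it into a helper (toJsonValue is my own name, not part of Pinot), which the loop could then call as record.put(field, toJsonValue(row.getValue(field))):

// Hypothetical helper, not part of Pinot: multi-value columns arrive as Object[] and become
// JSON arrays, everything else is passed through unchanged, exactly as in the loop above.
private static Object toJsonValue(Object value) {
  if (value instanceof Object[]) {
    return new JSONArray(value);
  }
  return value;
}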
Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
From the class LLRealtimeSegmentDataManager, method processKafkaEvents().
private void processKafkaEvents(Iterable<MessageAndOffset> messagesAndOffsets, Long highWatermark) {
  Iterator<MessageAndOffset> msgIterator = messagesAndOffsets.iterator();
  int indexedMessageCount = 0;
  int kafkaMessageCount = 0;
  boolean canTakeMore = true;
  GenericRow decodedRow = null;
  GenericRow transformedRow = null;
  while (!_shouldStop && !endCriteriaReached() && msgIterator.hasNext()) {
    if (!canTakeMore) {
      // The RealtimeSegmentImpl that we are pushing rows into has indicated that it cannot accept any more
      // rows. This can happen under one of two conditions:
      // 1. We are in INITIAL_CONSUMING state, and we somehow exceeded the max number of rows we are allowed
      //    to consume for this segment. Something is seriously wrong, because endCriteriaReached() should
      //    have returned true when we hit the row limit.
      //    Throw an exception.
      //
      // 2. We are in CATCHING_UP state, and we legally hit this error due to Kafka unclean leader election
      //    where offsets get changed with higher generation numbers for some pinot servers but not others.
      //    So, if another server (who got a larger kafka offset) asked us to catch up to that offset, but we
      //    are connected to a broker who has smaller offsets, then we may try to push more rows into the
      //    buffer than maximum. This is a rare case, and we really don't know how to handle this at this time.
      //    Throw an exception.
      //
      segmentLogger.error("Buffer full with {} rows consumed (row limit {})", _numRowsConsumed, _segmentMaxRowCount);
      throw new RuntimeException("Realtime segment full");
    }
    // Index each message
    MessageAndOffset messageAndOffset = msgIterator.next();
    byte[] array = messageAndOffset.message().payload().array();
    int offset = messageAndOffset.message().payload().arrayOffset();
    int length = messageAndOffset.message().payloadSize();
    decodedRow = GenericRow.createOrReuseRow(decodedRow);
    decodedRow = _messageDecoder.decode(array, offset, length, decodedRow);
    // Update the lag metric on the first message of each batch
    if (kafkaMessageCount == 0) {
      long messageOffset = messageAndOffset.offset();
      long offsetDifference = highWatermark - messageOffset;
      _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.KAFKA_PARTITION_OFFSET_LAG, offsetDifference);
    }
    if (decodedRow != null) {
      transformedRow = GenericRow.createOrReuseRow(transformedRow);
      transformedRow = _fieldExtractor.transform(decodedRow, transformedRow);
      if (transformedRow != null) {
        _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.REALTIME_ROWS_CONSUMED, 1);
        indexedMessageCount++;
      } else {
        _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.INVALID_REALTIME_ROWS_DROPPED, 1);
      }
      canTakeMore = _realtimeSegment.index(transformedRow);
    } else {
      _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.INVALID_REALTIME_ROWS_DROPPED, 1);
    }
    _currentOffset = messageAndOffset.nextOffset();
    _numRowsConsumed++;
    kafkaMessageCount++;
  }
  updateCurrentDocumentCountMetrics();
  if (kafkaMessageCount != 0) {
    segmentLogger.debug("Indexed {} messages ({} messages read from Kafka) current offset {}", indexedMessageCount,
        kafkaMessageCount, _currentOffset);
    _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_KAFKA_OFFSET_CONSUMED, _currentOffset);
  } else {
    // If there were no messages to be fetched from Kafka, wait for a little bit so as to avoid hammering the
    // Kafka broker
    Uninterruptibles.sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
  }
}
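The two createOrReuseRow() calls above implement a row-recycling pattern: each GenericRow is allocated once and then repopulated on every loop iteration to keep garbage off the hot consume path. A standalone sketch of just that pattern, where hasMoreMessages(), nextRowValues() and process() are hypothetical stand-ins for the Kafka iterator, the decode/transform pipeline, and the segment indexing call:

// Sketch of the reuse pattern only; hasMoreMessages(), nextRowValues() and process() are hypothetical.
GenericRow reusedRow = null;
while (hasMoreMessages()) {
  reusedRow = GenericRow.createOrReuseRow(reusedRow); // allocates on the first pass, reuses afterwards
  reusedRow.init(nextRowValues());                    // repopulate the recycled row (init(Map), as in RealtimeDimensionsSerDe below)
  process(reusedRow);                                 // hypothetical consumer, e.g. indexing into a segment
}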
Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
From the class RealtimeDimensionsSerDe, method deSerialize().
public GenericRow deSerialize(ByteBuffer buffer) {
  GenericRow row = new GenericRow();
  Map<String, Object> rowValues = new HashMap<String, Object>();
  for (String dimension : dataSchema.getDimensionNames()) {
    // Read the dictionary ids for this dimension from the buffer, then map them back to values.
    int[] ret = deSerializeAndReturnDicIdsFor(dimension, buffer);
    if (dataSchema.getFieldSpecFor(dimension).isSingleValueField()) {
      rowValues.put(dimension, dictionaryMap.get(dimension).get(ret[0]));
    } else {
      // Multi-value field: resolve every dictionary id into its value.
      Object[] mV = new Object[ret.length];
      for (int i = 0; i < ret.length; i++) {
        mV[i] = dictionaryMap.get(dimension).get(ret[i]);
      }
      rowValues.put(dimension, mV);
    }
  }
  row.init(rowValues);
  return row;
}
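Since deSerialize() only needs init(Map) to assemble the row, the same API is enough to build a GenericRow by hand, for example in a test; a minimal sketch using only calls that appear in the snippets on this page (the field names and values are made up):

// Minimal sketch: construct a GenericRow from a map and read it back.
Map<String, Object> values = new HashMap<String, Object>();
values.put("country", "US");                        // single-value dimension
values.put("tags", new Object[] { "a", "b" });      // multi-value dimension stored as Object[]
GenericRow testRow = new GenericRow();
testRow.init(values);
System.out.println(testRow.getValue("country"));                                // US
System.out.println(StringUtils.join((Object[]) testRow.getValue("tags"), ';')); // a;b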
Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
From the class SegmentIndexCreationDriverImpl, method buildRaw().
private void buildRaw() throws Exception {
  // Count the number of documents and gather per-column statistics
  LOGGER.debug("Start building StatsCollector!");
  buildIndexCreationInfo();
  LOGGER.info("Finished building StatsCollector!");
  LOGGER.info("Collected stats for {} documents", totalDocs);
  // Initialize the index creation using the per-column statistics information
  indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);
  // Build the index
  recordReader.rewind();
  LOGGER.info("Start building IndexCreator!");
  GenericRow readRow = new GenericRow();
  GenericRow transformedRow = new GenericRow();
  while (recordReader.hasNext()) {
    long start = System.currentTimeMillis();
    transformedRow = readNextRowSanitized(readRow, transformedRow);
    long stop = System.currentTimeMillis();
    indexCreator.indexRow(transformedRow);
    long stop1 = System.currentTimeMillis();
    totalRecordReadTime += (stop - start);
    totalIndexTime += (stop1 - stop);
  }
  recordReader.close();
  LOGGER.info("Finished records indexing in IndexCreator!");
  int numErrors, numConversions, numNulls, numNullCols;
  if ((numErrors = extractor.getTotalErrors()) > 0) {
    LOGGER.warn("Index creator for schema {} had {} rows with errors", dataSchema.getSchemaName(), numErrors);
  }
  Map<String, Integer> errorCount = extractor.getErrorCount();
  for (String column : errorCount.keySet()) {
    if ((numErrors = errorCount.get(column)) > 0) {
      LOGGER.info("Column {} had {} rows with errors", column, numErrors);
    }
  }
  if ((numConversions = extractor.getTotalConversions()) > 0) {
    LOGGER.info("Index creator for schema {} had {} rows with type conversions", dataSchema.getSchemaName(), numConversions);
  }
  if ((numNulls = extractor.getTotalNulls()) > 0) {
    LOGGER.info("Index creator for schema {} had {} rows with null columns", dataSchema.getSchemaName(), numNulls);
  }
  if ((numNullCols = extractor.getTotalNullCols()) > 0) {
    LOGGER.info("Index creator for schema {} had {} null columns", dataSchema.getSchemaName(), numNullCols);
  }
  handlePostCreation();
}
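buildRaw() is normally reached through the driver's public entry points rather than called directly. A heavily hedged sketch of that flow: init(SegmentGeneratorConfig) and build() are assumed to be the public methods that eventually dispatch to buildRaw(), and the config population is abbreviated.

// Illustrative only: init(...) and build() are assumed entry points; config population is omitted.
SegmentGeneratorConfig config = new SegmentGeneratorConfig(); // input path, schema, output dir, segment name would be set here
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
driver.init(config);
driver.build(); // assumed to run the stats-collection and indexing passes shown in buildRaw() above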