Search in sources:

Example 1 with TiChunkColumnVector

use of com.pingcap.tikv.columnar.TiChunkColumnVector in project tispark by pingcap.

The method getTiChunkIterator of the class CoprocessorIterator.

/**
 * Build a DAGIterator from TiDAGRequest and region tasks to get rows
 *
 * <p>When we are performing a scan request using coveringIndex, {@link
 * com.pingcap.tidb.tipb.IndexScan} should be used to read index rows. In other circumstances,
 * {@link com.pingcap.tidb.tipb.TableScan} is used to scan table rows.
 *
 * @param req TiDAGRequest built
 * @param regionTasks a list of RegionTask, each containing a task on a single region
 * @param session TiSession
 * @param numOfRows upper bound on the number of rows packed into each TiChunk returned by next()
 * @return a DAGIterator to be processed
 */
public static CoprocessorIterator<TiChunk> getTiChunkIterator(TiDAGRequest req, List<RegionTask> regionTasks, TiSession session, int numOfRows) {
    // Copy the request so mutations made while building/iterating do not leak into the caller's instance.
    TiDAGRequest dagRequest = req.copy();
    return new DAGIterator<TiChunk>(dagRequest.buildTableScan(), regionTasks, session, SchemaInfer.create(dagRequest), dagRequest.getPushDownType(), dagRequest.getStoreType(), dagRequest.getStartTs().getVersion()) {

        @Override
        public TiChunk next() {
            // Output column types inferred from the DAG request schema, one entry per column.
            DataType[] dataTypes = this.schemaInfer.getTypes().toArray(new DataType[0]);
            // TODO tiColumnarBatch is meant to be reused in the entire data loading process.
            if (this.encodeType == EncodeType.TypeDefault) {
                // Row-oriented (default) encoding: read up to numOfRows rows one by one and
                // wrap them in row-backed column vectors.
                Row[] rows = new Row[numOfRows];
                int count = 0;
                // NOTE(review): count always mirrors the loop index i; slots past `count`
                // remain null when the source runs out before numOfRows rows are read.
                for (int i = 0; i < rows.length && hasNext(); i++) {
                    rows[i] = rowReader.readRow(dataTypes);
                    count += 1;
                }
                TiRowColumnVector[] columnarVectors = new TiRowColumnVector[dataTypes.length];
                for (int i = 0; i < dataTypes.length; i++) {
                    // Each vector shares the same rows array and projects out column i.
                    columnarVectors[i] = new TiRowColumnVector(dataTypes[i], i, rows, count);
                }
                return new TiChunk(columnarVectors);
            } else if (this.encodeType == EncodeType.TypeChunk) {
                // Chunk encoding: every hasNext() refill of dataInput yields one decoded
                // batch per column; batches accumulate and are stitched together below.
                TiColumnVector[] columnarVectors = new TiColumnVector[dataTypes.length];
                List<List<TiChunkColumnVector>> childColumnVectors = new ArrayList<>();
                for (int i = 0; i < dataTypes.length; i++) {
                    childColumnVectors.add(new ArrayList<>());
                }
                int count = 0;
                // TODO(Zhexuan Yang) we need control memory limit in case of out of memory error
                while (count < numOfRows && hasNext()) {
                    for (int i = 0; i < dataTypes.length; i++) {
                        childColumnVectors.get(i).add(dataTypes[i].decodeChunkColumn(dataInput));
                    }
                    // All columns of one batch carry the same row count, so read it off the
                    // most recently decoded batch of column 0.
                    int size = childColumnVectors.get(0).size();
                    count += childColumnVectors.get(0).get(size - 1).numOfRows();
                    // Discard any leftover bytes so the next hasNext() fetches fresh data.
                    dataInput = new CodecDataInput(new byte[0]);
                }
                for (int i = 0; i < dataTypes.length; i++) {
                    columnarVectors[i] = new BatchedTiChunkColumnVector(childColumnVectors.get(i), count);
                }
                return new TiChunk(columnarVectors);
            } else {
                // Remaining encoding — presumably the CH (ClickHouse-style) block format,
                // given CHType/CHTypeMapping below; TODO confirm. Header carries column
                // count and row count, then per column: name, type name, encoded data.
                // reading column count
                long colCount = IntegerCodec.readUVarLong(dataInput);
                long numOfRows = IntegerCodec.readUVarLong(dataInput);
                TiColumnVector[] columnVectors = new TiColumnVector[(int) colCount];
                for (int columnIdx = 0; columnIdx < colCount; columnIdx++) {
                    // reading column name (skipped — only its length is consumed)
                    long length = IntegerCodec.readUVarLong(dataInput);
                    for (int i = 0; i < length; i++) {
                        dataInput.readByte();
                    }
                    // reading type name
                    length = IntegerCodec.readUVarLong(dataInput);
                    byte[] utf8Bytes = new byte[(int) length];
                    for (int i = 0; i < length; i++) {
                        utf8Bytes[i] = dataInput.readByte();
                    }
                    String typeName = new String(utf8Bytes, StandardCharsets.UTF_8);
                    CHType type = CHTypeMapping.parseType(typeName);
                    columnVectors[columnIdx] = type.decode(dataInput, (int) numOfRows);
                // TODO this is a workaround to bypass nullable types
                }
                // Trash whatever bytes remain in the current buffer.
                dataInput = new CodecDataInput(new byte[0]);
                return new TiChunk(columnVectors);
            }
        }
    };
}
Also used : TiChunk(com.pingcap.tikv.columnar.TiChunk) BatchedTiChunkColumnVector(com.pingcap.tikv.columnar.BatchedTiChunkColumnVector) ArrayList(java.util.ArrayList) CHType(com.pingcap.tikv.columnar.datatypes.CHType) TiDAGRequest(com.pingcap.tikv.meta.TiDAGRequest) TiChunkColumnVector(com.pingcap.tikv.columnar.TiChunkColumnVector) BatchedTiChunkColumnVector(com.pingcap.tikv.columnar.BatchedTiChunkColumnVector) CodecDataInput(com.pingcap.tikv.codec.CodecDataInput) DataType(com.pingcap.tikv.types.DataType) ArrayList(java.util.ArrayList) List(java.util.List) Row(com.pingcap.tikv.row.Row) TiRowColumnVector(com.pingcap.tikv.columnar.TiRowColumnVector)

Example 2 with TiChunkColumnVector

use of com.pingcap.tikv.columnar.TiChunkColumnVector in project tispark by pingcap.

The method decodeChunkColumn of the class DataType.

// Decode one column of a chunk from the stream; all fields are little-endian.
public TiChunkColumnVector decodeChunkColumn(CodecDataInput cdi) {
    // Header: row count followed by null count.
    int rowCount = readIntLittleEndian(cdi);
    int nullCount = readIntLittleEndian(cdi);
    assert (rowCount >= 0) && (nullCount >= 0);

    // One bit per row, rounded up to whole bytes.
    int bitmapLen = (rowCount + 7) / 8;
    byte[] nullBitmap;
    if (nullCount > 0) {
        nullBitmap = new byte[bitmapLen];
        cdi.readFully(nullBitmap);
    } else {
        // When there are no nulls the bitmap is omitted from the stream;
        // synthesize an all-not-null bitmap instead.
        nullBitmap = setAllNotNull(bitmapLen);
    }

    int fixedLen = getFixLen();
    long[] offsets = null;
    int dataLen;
    if (fixedLen == -1) {
        // Variable-length element: rowCount + 1 little-endian int64 offsets precede
        // the data section; the final offset is the total data size in bytes.
        int offsetCount = rowCount + 1;
        offsets = new long[offsetCount];
        for (int idx = 0; idx < offsetCount; idx++) {
            offsets[idx] = readLongLittleEndian(cdi);
        }
        dataLen = (int) offsets[rowCount];
    } else {
        dataLen = fixedLen * rowCount;
    }

    // TODO this copy costs a lot; we need to find a way to avoid it.
    byte[] raw = new byte[dataLen];
    cdi.readFully(raw);
    ByteBuffer data = ByteBuffer.wrap(raw).order(LITTLE_ENDIAN);
    return new TiChunkColumnVector(this, fixedLen, rowCount, nullCount, nullBitmap, offsets, data);
}
Also used : TiChunkColumnVector(com.pingcap.tikv.columnar.TiChunkColumnVector) ByteBuffer(java.nio.ByteBuffer)

Aggregations

TiChunkColumnVector (com.pingcap.tikv.columnar.TiChunkColumnVector)2 CodecDataInput (com.pingcap.tikv.codec.CodecDataInput)1 BatchedTiChunkColumnVector (com.pingcap.tikv.columnar.BatchedTiChunkColumnVector)1 TiChunk (com.pingcap.tikv.columnar.TiChunk)1 TiRowColumnVector (com.pingcap.tikv.columnar.TiRowColumnVector)1 CHType (com.pingcap.tikv.columnar.datatypes.CHType)1 TiDAGRequest (com.pingcap.tikv.meta.TiDAGRequest)1 Row (com.pingcap.tikv.row.Row)1 DataType (com.pingcap.tikv.types.DataType)1 ByteBuffer (java.nio.ByteBuffer)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1