Search in sources :

Example 36 with CarbonColumn

use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.

The class CarbonLoadModelBuilder, method build.

/**
 * Build CarbonLoadModel for data loading.
 * @param options Load options from user input
 * @param optionsFinal Load options that are populated with default values for optional options
 * @param carbonLoadModel The output load model
 * @param hadoopConf hadoopConf is needed to read the CSV header if 'fileheader' is not set in
 *                   user provided load options
 * @param partitions partition name map to path
 * @param isDataFrame true if build for load for dataframe
 * @throws InvalidLoadOptionException if any load option is inconsistent or invalid
 * @throws IOException if reading the CSV header or load metadata fails
 */
public void build(Map<String, String> options, Map<String, String> optionsFinal, CarbonLoadModel carbonLoadModel, Configuration hadoopConf, Map<String, String> partitions, boolean isDataFrame) throws InvalidLoadOptionException, IOException {
    // Basic table identity on the model (setTableName was previously called twice; once is enough).
    carbonLoadModel.setTableName(table.getTableName());
    carbonLoadModel.setDatabaseName(table.getDatabaseName());
    carbonLoadModel.setTablePath(table.getTablePath());
    carbonLoadModel.setCarbonTransactionalTable(table.isTransactionalTable());
    CarbonDataLoadSchema dataLoadSchema = new CarbonDataLoadSchema(table);
    // Need to fill dimension relation
    carbonLoadModel.setCarbonDataLoadSchema(dataLoadSchema);
    String sort_scope = optionsFinal.get("sort_scope");
    String bad_records_logger_enable = optionsFinal.get("bad_records_logger_enable");
    String bad_records_action = optionsFinal.get("bad_records_action");
    String bad_record_path = optionsFinal.get("bad_record_path");
    String global_sort_partitions = optionsFinal.get("global_sort_partitions");
    String timestampformat = optionsFinal.get("timestampformat");
    String dateFormat = optionsFinal.get("dateformat");
    String delimiter = optionsFinal.get("delimiter");
    String complex_delimiter_level1 = optionsFinal.get("complex_delimiter_level_1");
    String complex_delimiter_level2 = optionsFinal.get("complex_delimiter_level_2");
    String complex_delimiter_level3 = optionsFinal.get("complex_delimiter_level_3");
    String complex_delimiter_level4 = optionsFinal.get("complex_delimiter_level_4");
    validateDateTimeFormat(timestampformat, "TimestampFormat");
    validateDateTimeFormat(dateFormat, "DateFormat");
    // A bad-record location is mandatory whenever bad records are logged or redirected.
    if (Boolean.parseBoolean(bad_records_logger_enable) || LoggerAction.REDIRECT.name().equalsIgnoreCase(bad_records_action)) {
        if (!StringUtils.isEmpty(bad_record_path)) {
            bad_record_path = CarbonUtil.checkAndAppendHDFSUrl(bad_record_path);
        } else {
            throw new InvalidLoadOptionException("Cannot redirect bad records as bad record location is not provided.");
        }
    }
    carbonLoadModel.setBadRecordsLocation(bad_record_path);
    validateGlobalSortPartitions(global_sort_partitions);
    carbonLoadModel.setEscapeChar(checkDefaultValue(optionsFinal.get("escapechar"), "\\"));
    carbonLoadModel.setQuoteChar(CarbonUtil.unescapeChar(checkDefaultValue(optionsFinal.get("quotechar"), "\"")));
    carbonLoadModel.setCommentChar(checkDefaultValue(optionsFinal.get("commentchar"), "#"));
    String lineSeparator = CarbonUtil.unescapeChar(options.get("line_separator"));
    if (lineSeparator != null) {
        carbonLoadModel.setLineSeparator(lineSeparator);
    }
    // if there isn't file header in csv file and load sql doesn't provide FILEHEADER option,
    // we should use table schema to generate file header.
    String fileHeader = optionsFinal.get("fileheader");
    String headerOption = optionsFinal.get("header");
    if (StringUtils.isNotEmpty(headerOption)) {
        if (!headerOption.equalsIgnoreCase("true") && !headerOption.equalsIgnoreCase("false")) {
            throw new InvalidLoadOptionException("'header' option should be either 'true' or 'false'.");
        }
        // whether the csv file has file header, the default value is true
        if (Boolean.parseBoolean(headerOption)) {
            if (!StringUtils.isEmpty(fileHeader)) {
                throw new InvalidLoadOptionException("When 'header' option is true, 'fileheader' option is not required.");
            }
        } else if (StringUtils.isEmpty(fileHeader)) {
            // Header-less CSV with no explicit FILEHEADER: derive the header from the
            // table's create-order columns. (The previous version also appended an
            // always-empty partitionColumns list here; that dead code is removed.)
            List<CarbonColumn> columns = table.getCreateOrderColumn();
            List<String> columnNames = new ArrayList<>(columns.size());
            for (CarbonColumn column : columns) {
                columnNames.add(column.getColName());
            }
            fileHeader = Strings.mkString(columnNames.toArray(new String[columnNames.size()]), ",");
        }
    }
    String binaryDecoder = options.get("binary_decoder");
    carbonLoadModel.setBinaryDecoder(binaryDecoder);
    carbonLoadModel.setTimestampFormat(timestampformat);
    carbonLoadModel.setDateFormat(dateFormat);
    carbonLoadModel.setDefaultTimestampFormat(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT));
    carbonLoadModel.setDefaultDateFormat(CarbonProperties.getInstance().getProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT));
    carbonLoadModel.setSerializationNullFormat(TableOptionConstant.SERIALIZATION_NULL_FORMAT.getName() + "," + optionsFinal.get("serialization_null_format"));
    carbonLoadModel.setBadRecordsLoggerEnable(TableOptionConstant.BAD_RECORDS_LOGGER_ENABLE.getName() + "," + bad_records_logger_enable);
    carbonLoadModel.setBadRecordsAction(TableOptionConstant.BAD_RECORDS_ACTION.getName() + "," + bad_records_action.toUpperCase());
    carbonLoadModel.setIsEmptyDataBadRecord(DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD + "," + optionsFinal.get("is_empty_data_bad_record"));
    carbonLoadModel.setSkipEmptyLine(optionsFinal.get("skip_empty_line"));
    carbonLoadModel.setSortScope(sort_scope);
    // Fall back to the table-level global sort partitions when not supplied in options.
    if (global_sort_partitions == null) {
        global_sort_partitions = table.getGlobalSortPartitions();
    }
    carbonLoadModel.setGlobalSortPartitions(global_sort_partitions);
    // NOTE(review): level-3/level-4 complex delimiters are not fully cross-checked
    // against each other here (only the pairs below are validated) — confirm whether
    // that is intentional before tightening.
    if (delimiter.equalsIgnoreCase(complex_delimiter_level1) || complex_delimiter_level1.equalsIgnoreCase(complex_delimiter_level2) || delimiter.equalsIgnoreCase(complex_delimiter_level2) || delimiter.equalsIgnoreCase(complex_delimiter_level3)) {
        throw new InvalidLoadOptionException("Field Delimiter and Complex types delimiter are same");
    } else {
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level1);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level2);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level3);
        carbonLoadModel.setComplexDelimiter(complex_delimiter_level4);
    }
    carbonLoadModel.setCsvDelimiter(CarbonUtil.unescapeChar(delimiter));
    carbonLoadModel.setCsvHeader(fileHeader);
    // For non-dataframe loads, static partition columns (those with a value) are
    // excluded from the CSV header columns.
    List<String> ignoreColumns = new ArrayList<>();
    if (!isDataFrame) {
        for (Map.Entry<String, String> partition : partitions.entrySet()) {
            if (partition.getValue() != null) {
                ignoreColumns.add(partition.getKey());
            }
        }
    }
    carbonLoadModel.setCsvHeaderColumns(LoadOption.getCsvHeaderColumns(carbonLoadModel, hadoopConf, ignoreColumns));
    int validatedMaxColumns = validateMaxColumns(carbonLoadModel.getCsvHeaderColumns(), optionsFinal.get("maxcolumns"));
    carbonLoadModel.setMaxColumns(String.valueOf(validatedMaxColumns));
    if (carbonLoadModel.isCarbonTransactionalTable()) {
        carbonLoadModel.readAndSetLoadMetadataDetails();
    }
    carbonLoadModel.setSortColumnsBoundsStr(optionsFinal.get("sort_column_bounds"));
    carbonLoadModel.setLoadMinSize(optionsFinal.get(CarbonCommonConstants.CARBON_LOAD_MIN_SIZE_INMB));
    validateAndSetLoadMinSize(carbonLoadModel);
    validateAndSetColumnCompressor(carbonLoadModel);
    validateAndSetBinaryDecoder(carbonLoadModel);
    validateRangeColumn(optionsFinal, carbonLoadModel);
    carbonLoadModel.setMetrics(new DataLoadMetrics());
}
Also used : InvalidLoadOptionException(org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException) DataLoadMetrics(org.apache.carbondata.core.util.DataLoadMetrics) CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map)

Example 37 with CarbonColumn

use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.

The class DataLoadProcessBuilder, method getReArrangedDataFields.

/**
 * Re-arranges the data fields for loading: sort-column dimensions first, then
 * no-sort dimensions, then complex columns, then measures, with partition
 * columns appended at the very end in partition-schema order.
 *
 * @param loadModel          supplies the date/timestamp formats to propagate
 * @param carbonTable        table whose partition info (if any) drives the re-arrangement
 * @param dimensions         dimension columns of the table
 * @param measures           measure columns of the table
 * @param complexDataFields  output: complex (non-partition) dimension fields
 * @param partitionColumns   output: fields belonging to partition columns
 * @param dataFields         output: the final re-arranged field list
 */
private static void getReArrangedDataFields(CarbonLoadModel loadModel, CarbonTable carbonTable, List<CarbonDimension> dimensions, List<CarbonMeasure> measures, List<DataField> complexDataFields, List<DataField> partitionColumns, List<DataField> dataFields) {
    // re-arrange the data fields, as partition column data will be present in the end
    List<ColumnSchema> partitionColumnSchemaList;
    if (carbonTable.getPartitionInfo() != null) {
        partitionColumnSchemaList = carbonTable.getPartitionInfo().getColumnSchemaList();
    } else {
        partitionColumnSchemaList = new ArrayList<>();
    }
    // 1.1 compatibility, dimensions will not have sort columns in the beginning in 1.1.
    // Need to keep at the beginning now
    List<DataField> sortDataFields = new ArrayList<>();
    List<DataField> noSortDataFields = new ArrayList<>();
    for (CarbonColumn column : dimensions) {
        DataField dataField = new DataField(column);
        if (column.isComplex()) {
            // Propagate date/timestamp formats down to the children of complex columns.
            List<CarbonDimension> childDimensions = ((CarbonDimension) dataField.getColumn()).getListOfChildDimensions();
            for (CarbonDimension childDimension : childDimensions) {
                if (childDimension.getDataType() == DataTypes.DATE) {
                    childDimension.setDateFormat(loadModel.getDateFormat());
                } else if (childDimension.getDataType() == DataTypes.TIMESTAMP) {
                    childDimension.setTimestampFormat(loadModel.getTimestampFormat());
                }
            }
            if (isPartitionColumn(partitionColumnSchemaList, column)) {
                partitionColumns.add(dataField);
            } else {
                complexDataFields.add(dataField);
            }
        } else {
            if (column.getDataType() == DataTypes.DATE) {
                dataField.setDateFormat(loadModel.getDateFormat());
                column.setDateFormat(loadModel.getDateFormat());
            } else if (column.getDataType() == DataTypes.TIMESTAMP) {
                dataField.setTimestampFormat(loadModel.getTimestampFormat());
                column.setTimestampFormat(loadModel.getTimestampFormat());
            }
            if (isPartitionColumn(partitionColumnSchemaList, column)) {
                partitionColumns.add(dataField);
            } else if (dataField.getColumn().getColumnSchema().isSortColumn()) {
                sortDataFields.add(dataField);
            } else {
                noSortDataFields.add(dataField);
            }
        }
    }
    // addAll on an empty list is a no-op, so no size guards are required here.
    dataFields.addAll(sortDataFields);
    dataFields.addAll(noSortDataFields);
    dataFields.addAll(complexDataFields);
    for (CarbonColumn column : measures) {
        if (isPartitionColumn(partitionColumnSchemaList, column)) {
            partitionColumns.add(new DataField(column));
        } else if (!(column.getColName().equals("default_dummy_measure"))) {
            // This dummy measure is added when no measure was present. We no need to load it.
            dataFields.add(new DataField(column));
        }
    }
    if (!partitionColumns.isEmpty()) {
        // add partition columns at the end, re-arranged as per the partition column schema order
        List<DataField> reArrangedPartitionColumns = new ArrayList<>();
        for (ColumnSchema col : partitionColumnSchemaList) {
            for (DataField field : partitionColumns) {
                if (field.getColumn().getColumnSchema().equals(col)) {
                    reArrangedPartitionColumns.add(field);
                    break;
                }
            }
        }
        dataFields.addAll(reArrangedPartitionColumns);
    }
}

/** Returns true when the given column is one of the table's partition columns. */
private static boolean isPartitionColumn(List<ColumnSchema> partitionColumnSchemaList, CarbonColumn column) {
    return !partitionColumnSchemaList.isEmpty() && partitionColumnSchemaList.contains(column.getColumnSchema());
}
Also used : CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn) ArrayList(java.util.ArrayList) ColumnSchema(org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema) CarbonDimension(org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension)

Example 38 with CarbonColumn

use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.

The class DataLoadProcessBuilder, method getDataFields.

/**
 * Populates the data fields for loading: primitive dimensions go straight into
 * dataFields, complex dimensions are collected separately and appended after
 * them, and finally the measures are added.
 *
 * @param loadModel         supplies the date/timestamp formats to propagate
 * @param dimensions        dimension columns of the table
 * @param measures          measure columns of the table
 * @param complexDataFields output: fields for complex dimensions
 * @param dataFields        output: the complete ordered field list
 */
private static void getDataFields(CarbonLoadModel loadModel, List<CarbonDimension> dimensions, List<CarbonMeasure> measures, List<DataField> complexDataFields, List<DataField> dataFields) {
    // And then add complex data types and measures.
    for (CarbonColumn dimension : dimensions) {
        DataField field = new DataField(dimension);
        // Apply the load model's formats on both the field and the column itself.
        if (dimension.getDataType() == DataTypes.DATE) {
            field.setDateFormat(loadModel.getDateFormat());
            dimension.setDateFormat(loadModel.getDateFormat());
        }
        if (dimension.getDataType() == DataTypes.TIMESTAMP) {
            field.setTimestampFormat(loadModel.getTimestampFormat());
            dimension.setTimestampFormat(loadModel.getTimestampFormat());
        }
        if (!dimension.isComplex()) {
            dataFields.add(field);
        } else {
            complexDataFields.add(field);
            // Complex columns also need the formats pushed down to every child dimension.
            for (CarbonDimension child : ((CarbonDimension) field.getColumn()).getListOfChildDimensions()) {
                if (child.getDataType() == DataTypes.DATE) {
                    child.setDateFormat(loadModel.getDateFormat());
                } else if (child.getDataType() == DataTypes.TIMESTAMP) {
                    child.setTimestampFormat(loadModel.getTimestampFormat());
                }
            }
        }
    }
    dataFields.addAll(complexDataFields);
    for (CarbonColumn measure : measures) {
        // This dummy measure is added when no measure was present. We no need to load it.
        if (!(measure.getColName().equals("default_dummy_measure"))) {
            dataFields.add(new DataField(measure));
        }
    }
}
Also used : CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn) CarbonDimension(org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension)

Example 39 with CarbonColumn

use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.

The class SchemaGenerator, method getMinMaxLength.

/**
 * Method to get the min max length of each column. It will return the length of only column
 * which will be cached.
 *
 * @param segmentProperties  segment metadata providing per-column value lengths
 * @param minMaxCacheColumns columns configured to be cached, or null to use all columns
 * @return value lengths for the cached columns only, or for every column when
 *         minMaxCacheColumns is null
 */
private static int[] getMinMaxLength(SegmentProperties segmentProperties, List<CarbonColumn> minMaxCacheColumns) {
    // Hoisted out of the loop: the previous version called createColumnValueLength()
    // on every iteration. Assumes the call is deterministic for a given
    // SegmentProperties instance — TODO(review) confirm.
    int[] columnValueLengths = segmentProperties.createColumnValueLength();
    if (null == minMaxCacheColumns) {
        return columnValueLengths;
    }
    int[] minMaxLen = new int[minMaxCacheColumns.size()];
    int counter = 0;
    for (CarbonColumn column : minMaxCacheColumns) {
        minMaxLen[counter++] = columnValueLengths[BlockletIndexUtil.getColumnOrdinal(segmentProperties, column)];
    }
    return minMaxLen;
}
Also used : CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn)

Example 40 with CarbonColumn

use of org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn in project carbondata by apache.

The class BlockletIndexUtil, method getMinMaxForColumnsToBeCached.

/**
 * Method to get the min/max values for columns to be cached.
 *
 * @param segmentProperties         segment metadata used to resolve column ordinals
 * @param minMaxCacheColumns        columns to be cached, or null to keep all columns
 * @param minMaxValuesForAllColumns min/max values of every column in the segment
 * @return min/max values restricted to the cached columns, or the full array
 *         unchanged when no cache columns are configured
 */
public static byte[][] getMinMaxForColumnsToBeCached(SegmentProperties segmentProperties, List<CarbonColumn> minMaxCacheColumns, byte[][] minMaxValuesForAllColumns) {
    // No explicit cache-column list: every column's min/max is kept.
    if (null == minMaxCacheColumns) {
        return minMaxValuesForAllColumns;
    }
    byte[][] cachedMinMaxValues = new byte[minMaxCacheColumns.size()][];
    int index = 0;
    for (CarbonColumn cachedColumn : minMaxCacheColumns) {
        // Pick this column's slot out of the full array by its ordinal.
        cachedMinMaxValues[index] = minMaxValuesForAllColumns[getColumnOrdinal(segmentProperties, cachedColumn)];
        index++;
    }
    return cachedMinMaxValues;
}
Also used : CarbonColumn(org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn)

Aggregations

CarbonColumn (org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn)45 ArrayList (java.util.ArrayList)20 CarbonDimension (org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension)14 CarbonMeasure (org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure)11 ColumnExpression (org.apache.carbondata.core.scan.expression.ColumnExpression)6 LiteralExpression (org.apache.carbondata.core.scan.expression.LiteralExpression)6 DataType (org.apache.carbondata.core.metadata.datatype.DataType)5 ColumnSchema (org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema)5 Expression (org.apache.carbondata.core.scan.expression.Expression)5 AndExpression (org.apache.carbondata.core.scan.expression.logical.AndExpression)5 InExpression (org.apache.carbondata.core.scan.expression.conditional.InExpression)4 TrueExpression (org.apache.carbondata.core.scan.expression.logical.TrueExpression)4 DataField (org.apache.carbondata.processing.loading.DataField)4 HashMap (java.util.HashMap)3 MockUp (mockit.MockUp)3 AbstractDictionaryCacheTest (org.apache.carbondata.core.cache.dictionary.AbstractDictionaryCacheTest)3 CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable)3 Test (org.junit.Test)3 BufferedReader (java.io.BufferedReader)2 FileReader (java.io.FileReader)2